diff --git a/.cirrus.yml b/.cirrus.yml new file mode 100644 index 0000000000..18b292289e --- /dev/null +++ b/.cirrus.yml @@ -0,0 +1,21 @@ +env: + CIRRUS_CLONE_DEPTH: 1 + ARCH: amd64 + +build_task: + matrix: + freebsd_instance: + image_family: freebsd-12-4 + freebsd_instance: + image_family: freebsd-13-2 + freebsd_instance: + image_family: freebsd-14-0-snap + prepare_script: + - pkg install -y autoconf automake libtool gettext-runtime gmake ksh93 py39-packaging py39-cffi py39-sysctl + configure_script: + - env MAKE=gmake ./autogen.sh + - env MAKE=gmake ./configure --with-config="user" --with-python=3.9 + build_script: + - gmake -j `sysctl -n kern.smp.cpus` + install_script: + - gmake install diff --git a/.gitignore b/.gitignore index e5ee9cacec..bd3d4befd2 100644 --- a/.gitignore +++ b/.gitignore @@ -42,8 +42,10 @@ !udev/** !.editorconfig +!.cirrus.yml !.gitignore !.gitmodules +!.mailmap !AUTHORS !autogen.sh !CODE_OF_CONDUCT.md @@ -60,7 +62,6 @@ !TEST !zfs.release.in - # # Normal rules # diff --git a/.mailmap b/.mailmap new file mode 100644 index 0000000000..46ef016b93 --- /dev/null +++ b/.mailmap @@ -0,0 +1,189 @@ +# This file maps the name+email seen in a commit back to a canonical +# name+email. Git will replace the commit name/email with the canonical version +# wherever it sees it. +# +# If there is a commit in the history with a "wrong" name or email, list it +# here. If you regularly commit with an alternate name or email address and +# would like to ensure that you are always listed consistently in the repo, add +# mapping here. +# +# On the other hand, if you use multiple names or email addresses legitimately +# (eg you use a company email address for your paid OpenZFS work, and a +# personal address for your evening side projects), then don't map one to the +# other here. +# +# The most common formats are: +# +# Canonical Name +# Canonical Name +# Canonical Name Commit Name +# +# See https://git-scm.com/docs/gitmailmap for more info. 
+ +# These maps are making names consistent where they have varied but the email +# address has never changed. In most cases, the full name is in the +# Signed-off-by of a commit with a matching author. +Ahelenia Ziemiańska +Ahelenia Ziemiańska +Alex John +Andreas Dilger +Andrew Walker +Benedikt Neuffer +Chengfei Zhu +Chris Lindee +Colm Buckley +Crag Wang +Damian Szuberski +Daniel Kolesa +Debabrata Banerjee +Finix Yan +Gaurav Kumar +Gionatan Danti +Glenn Washburn +Gordan Bobic +Gregory Bartholomew +hedong zhang +InsanePrawn +Jason Cohen +Jason Harmening +Jeremy Faulkner +Jinshan Xiong +John Poduska +Justin Scholz +Ka Ho Ng +Kash Pande +Kay Pedersen +KernelOfTruth +Liu Hua +Liu Qing +loli10K +Matthias Blankertz +Michael Gmelin +Olivier Mazouffre +Piotr Kubaj +Quentin Zdanis +Roberto Ricci +Rob Norris +Rob Norris +Sam Lunt +Sanjeev Bagewadi +Stoiko Ivanov +Tamas TEVESZ +WHR +Yanping Gao +Youzhong Yang + +# Commits from strange places, long ago +Brian Behlendorf +Brian Behlendorf +Brian Behlendorf +Brian Behlendorf +Brian Behlendorf +Herb Wartens +Ned Bass +Tulsi Jain + +# Mappings from Github no-reply addresses +ajs124 +Alek Pinchuk +Alexander Lobakin +Alexey Smirnoff +Allen Holl <65494904+allen-4@users.noreply.github.com> +Ameer Hamza <106930537+ixhamza@users.noreply.github.com> +Andrew J. 
Hesford <48421688+ahesford@users.noreply.github.com>> +Andrew Sun +Aron Xu +Arun KV <65647132+arun-kv@users.noreply.github.com> +Ben Wolsieffer +bernie1995 <42413912+bernie1995@users.noreply.github.com> +Boris Protopopov +Brad Forschinger +Brandon Thetford +buzzingwires <131118055+buzzingwires@users.noreply.github.com> +Cedric Maunoury <38213715+cedricmaunoury@users.noreply.github.com> +Charles Suh +Dacian Reece-Stremtan <35844628+dacianstremtan@users.noreply.github.com> +Damian Szuberski <30863496+szubersk@users.noreply.github.com> +Daniel Hiepler <32984777+heeplr@users.noreply.github.com> +Daniel Kobras +Daniel Reichelt +David Quigley +DHE +Dmitri John Ledkov <19779+xnox@users.noreply.github.com> +Dries Michiels <32487486+driesmp@users.noreply.github.com> +Edmund Nadolski <137826107+ednadolski-ix@users.noreply.github.com> +Érico Nogueira <34201958+ericonr@users.noreply.github.com> +Fedor Uporov <60701163+fuporovvStack@users.noreply.github.com> +Felix Dörre +Felix Neumärker <34678034+xdch47@users.noreply.github.com> +Finix Yan +Gaurav Kumar +George Gaydarov +Georgy Yakovlev <168902+gyakovlev@users.noreply.github.com> +Gerardwx +Gian-Carlo DeFazio +Giuseppe Di Natale +Hajo Möller +Harry Mallon <1816667+hjmallon@users.noreply.github.com> +Hiếu Lê +Jake Howard +James Cowgill +Jason King +Jeff Dike <52420226+jdike@users.noreply.github.com> +Jitendra Patidar <53164267+jsai20@users.noreply.github.com> +João Carlos Mendes Luís +John Eismeier <32205350+jeis2497052@users.noreply.github.com> +John L. Hammond <35266395+jhammond-intel@users.noreply.github.com> +John-Mark Gurney +John Ramsden +Jonathon Fernyhough <559369+jonathonf@users.noreply.github.com> +Justin Hibbits +Kevin Jin <33590050+jxdking@users.noreply.github.com> +Kevin P. 
Fleming +Krzysztof Piecuch <3964215+pikrzysztof@users.noreply.github.com> +Kyle Evans +Laurențiu Nicola +loli10K +Lorenz Hüdepohl +Luís Henriques <73643340+lumigch@users.noreply.github.com> +Marcin Skarbek +Matt Fiddaman <81489167+matt-fidd@users.noreply.github.com> +Max Zettlmeißl <6818198+maxz@users.noreply.github.com> +Michael Niewöhner +Michael Zhivich <33133421+mzhivich@users.noreply.github.com> +Mo Zhou <5723047+cdluminate@users.noreply.github.com> +Nick Mattis +omni <79493359+omnivagant@users.noreply.github.com> +Pablo Correa Gómez <32678034+pablofsf@users.noreply.github.com> +Paul Zuchowski <31706010+PaulZ-98@users.noreply.github.com> +Peter Ashford +Peter Dave Hello +Peter Wirdemo <4224155+pewo@users.noreply.github.com> +Petros Koutoupis +Ping Huang <101400146+hpingfs@users.noreply.github.com> +Piotr P. Stefaniak +Richard Allen <33836503+belperite@users.noreply.github.com> +Rich Ercolani <214141+rincebrain@users.noreply.github.com> +Rob Wing <98866084+rob-wing@users.noreply.github.com> +Roman Strashkin +Ryan Hirasaki <4690732+RyanHir@users.noreply.github.com> +Samuel Wycliffe J <115969550+samwyc@users.noreply.github.com> +Samuel Wycliffe <50765275+npc203@users.noreply.github.com> +Savyasachee Jha +Scott Colby +Sean Eric Fagan +Spencer Kinny <30333052+Spencer-Kinny@users.noreply.github.com> +Srikanth N S <75025422+nssrikanth@users.noreply.github.com> +Thomas Geppert +Tim Crawford +Tom Matthews +Tony Perkins <62951051+tony-zfs@users.noreply.github.com> +Torsten Wörtwein +Tulsi Jain +Václav Skála <33496485+vaclavskala@users.noreply.github.com> +Violet Purcell <66446404+vimproved@users.noreply.github.com> +Vipin Kumar Verma <75025470+vermavipinkumar@users.noreply.github.com> +Wolfgang Bumiller +xtouqh <72357159+xtouqh@users.noreply.github.com> +Yuri Pankov <113725409+yuripv@users.noreply.github.com> +Yuri Pankov <82001006+yuripv@users.noreply.github.com> diff --git a/AUTHORS b/AUTHORS index c2af58d750..be1efb87b3 100644 --- a/AUTHORS +++ b/AUTHORS @@ -10,228 
+10,450 @@ PAST MAINTAINERS: CONTRIBUTORS: Aaron Fineman + Adam D. Moss Adam Leventhal Adam Stevko + adisbladis + Adrian Chadd + Ahelenia Ziemiańska Ahmed G + Aidan Harris + AJ Jordan + ajs124 Akash Ayare + Akash B Alan Somers Alar Aun Albert Lee Alec Salazar + Alejandro Colomar Alejandro R. Sedeño Alek Pinchuk Aleksa Sarai + Alexander Eremin + Alexander Lobakin + Alexander Motin + Alexander Pyhalov + Alexander Richardson + Alexander Stetsenko Alex Braunegg + Alexey Shvetsov + Alexey Smirnoff + Alex John Alex McWhirter Alex Reece Alex Wilson Alex Zhuravlev - Alexander Eremin - Alexander Motin - Alexander Pyhalov - Alexander Stetsenko - Alexey Shvetsov - Alexey Smirnoff Allan Jude + Allen Holl + alteriks + Alyssa Ross + Ameer Hamza + Anatoly Borodin AndCycle + Andrea Gelmini + Andrea Righi Andreas Buschmann Andreas Dilger + Andreas Vögele Andrew Barnes Andrew Hamilton + Andrew Innes + Andrew J. Hesford Andrew Reid Andrew Stormont + Andrew Sun Andrew Tselischev + Andrew Turner + Andrew Walker + Andrey Prokopenko Andrey Vesnovaty Andriy Gapon Andy Bakun + Andy Fiddaman Aniruddha Shankar + Anton Gubarkov Antonio Russo Arkadiusz Bubała + Armin Wehrfritz Arne Jansen Aron Xu + Arshad Hussain + Arun KV + Arvind Sankar + Attila Fülöp + Avatat Bart Coddens Basil Crow - Huang Liu + Bassu Ben Allen - Ben Rubson + Ben Cordero + Benedikt Neuffer Benjamin Albrecht + Benjamin Gentil + Ben McGough + Ben Rubson + Ben Wolsieffer + bernie1995 Bill McGonigle Bill Pijewski Boris Protopopov + Brad Forschinger Brad Lewis + Brandon Thetford + Brian Atkinson Brian Behlendorf Brian J. 
Murrell + Brooks Davis + BtbN + bunder2015 + buzzingwires + bzzz77 + cable2999 Caleb James DeLisle Cao Xuewen Carlo Landmeter Carlos Alberto Lopez Perez + Cedric Maunoury Chaoyu Zhang + Charles Suh Chen Can + Chengfei Zhu Chen Haiquan Chip Parker Chris Burroughs Chris Dunlap Chris Dunlop + Chris Lindee + Chris McDonough Chris Siden - Chris Wedgwood - Chris Williamson - Chris Zubrzycki - Christ Schlacta + Chris Siebenmann Christer Ekholm Christian Kohlschütter Christian Neukirchen Christian Schwarz Christopher Voltz + Christ Schlacta + Chris Wedgwood + Chris Williamson + Chris Zubrzycki + Chuck Tuffli Chunwei Chen Clemens Fruhwirth + Clemens Lang + Clint Armstrong Coleman Kane Colin Ian King + Colm Buckley + Crag Wang Craig Loomis Craig Sanders Cyril Plisko - DHE + Cy Schubert + Cédric Berger + Dacian Reece-Stremtan + Dag-Erling Smørgrav + Damiano Albani + Damian Szuberski Damian Wojsław + Daniel Hiepler + Daniel Hoffman + Daniel Kobras + Daniel Kolesa + Daniel Reichelt + Daniel Stevenson + Daniel Verite + Daniil Lunev Dan Kimmel Dan McDonald Dan Swartzendruber Dan Vatca - Daniel Hoffman - Daniel Verite - Daniil Lunev Darik Horn Dave Eddy + David Hedberg David Lamparter David Qian David Quigley Debabrata Banerjee + D. Ebdrup Denys Rtveliashvili Derek Dai + DHE + Didier Roche Dimitri John Ledkov + Dimitry Andric + Dirkjan Bussink Dmitry Khasanov + Dominic Pearson Dominik Hassler Dominik Honnef Don Brady + Doug Rabson Dr. András Korn + Dries Michiels + Edmund Nadolski + Eitan Adler Eli Rosenthal + Eli Schwartz Eric Desrochers Eric Dillmann Eric Schrock + Ethan Coe-Renner Etienne Dechamps + Evan Allrich + Evan Harris Evan Susarret Fabian Grünbichler + Fabio Buso + Fabio Scaccabarozzi Fajar A. Nugraha Fan Yong + fbynite + Fedor Uporov + Felix Dörre + Felix Neumärker Feng Sun + Finix Yan + Francesco Mazzoli Frederik Wessels Frédéric Vanniere + Gabriel A. 
Devenyi Garrett D'Amore + Garrett Fields Garrison Jensen Gary Mills Gaurav Kumar GeLiXin George Amanakis + George Diamantopoulos + George Gaydarov George Melikov George Wilson Georgy Yakovlev + Gerardwx + Gian-Carlo DeFazio + Gionatan Danti Giuseppe Di Natale + Glenn Washburn Gordan Bobic + Gordon Bergling Gordon Ross + Graham Christensen + Graham Perrin Gregor Kopka + Gregory Bartholomew + grembo Grischa Zengel + grodik Gunnar Beutner Gvozden Neskovic Hajo Möller + Han Gao Hans Rosenfeld + Harald van Dijk + Harry Mallon + Harry Sintonen + HC + hedong zhang + Heitor Alves de Siqueira + Henrik Riomar + Herb Wartens + Hiếu Lê + Huang Liu Håkan Johansson + Igor K Igor Kozhukhov Igor Lvovsky + ilbsmart + illiliti + ilovezfs + InsanePrawn Isaac Huang - JK Dingwall Jacek Fefliński + Jacob Adams + Jake Howard James Cowgill + James H James Lee James Pan + James Wah Jan Engelhardt Jan Kryl Jan Sanislo + Jason Cohen + Jason Harmening Jason King Jason Zaman Javen Wu + Jean-Baptiste Lallement + Jeff Dike + Jeremy Faulkner Jeremy Gill Jeremy Jones + Jeremy Visser Jerry Jelinek + Jessica Clarke Jinshan Xiong + Jitendra Patidar + JK Dingwall Joe Stein + John-Mark Gurney John Albietz John Eismeier - John L. Hammond + John Gallagher John Layman - John Paul Adrian Glaubitz - John Wren Kennedy + John L. Hammond + John M. Layman Johnny Stenback + John Paul Adrian Glaubitz + John Poduska + John Ramsden + John Wren Kennedy + jokersus + Jonathon Fernyhough Jorgen Lundman Josef 'Jeff' Sipek + Josh Soref Joshua M. Clulow + José Luis Salvador Rufo + João Carlos Mendes Luís + Julian Brunner + Julian Heuking + jumbi77 Justin Bedő + Justin Gottula + Justin Hibbits + Justin Keogh Justin Lecher + Justin Scholz Justin T. Gibbs + jyxent Jörg Thalheim - KORN Andras + ka7 + Ka Ho Ng Kamil Domański Karsten Kretschmer Kash Pande + Kay Pedersen Keith M Wesolowski + KernelOfTruth + Kevin Bowling + Kevin Jin + Kevin P. 
Fleming Kevin Tanguy KireinaHoro Kjeld Schouten-Lebbing + Kleber Tarcísio + Kody A Kantor Kohsuke Kawaguchi + Konstantin Khorenko + KORN Andras + Kristof Provost + Krzysztof Piecuch Kyle Blatter + Kyle Evans Kyle Fuller - Loli + Laevos + Lalufu Lars Johannsen + Laura Hild + Laurențiu Nicola + Lauri Tirkkonen + liaoyuxiangqin Li Dongyang + Liu Hua + Liu Qing Li Wei + Loli + lorddoskias + Lorenz Brun + Lorenz Hüdepohl + louwrentius Lukas Wunner + luozhengzheng + Luís Henriques Madhav Suresh + manfromafar Manoj Joseph Manuel Amador (Rudd-O) Marcel Huber + Marcel Menzel + Marcel Schilling Marcel Telka Marcel Wysocki + Marcin Skarbek + Mariusz Zaborski + Mark Johnston + Mark Maybee + Mark Roper Mark Shellenbaum + marku89 Mark Wright Martin Matuska + Martin Rüegg Massimo Maggi - Matt Johnston - Matt Kemp + Mateusz Guzik + Mateusz Piotrowski <0mp@FreeBSD.org> + Mathieu Velten + Matt Fiddaman Matthew Ahrens Matthew Thode + Matthias Blankertz + Matt Johnston + Matt Kemp + Matt Macy Matus Kral + Mauricio Faria de Oliveira Max Grossman Maximilian Mehnert + Max Zettlmeißl + Md Islam + megari + Michael D Labriola + Michael Franzl Michael Gebetsroither Michael Kjorling Michael Martin Michael Niewöhner + Michael Zhivich + Michal Vasilek Mike Gerdts Mike Harsch Mike Leddy Mike Swanson Milan Jurik + Minsoo Choo + Mohamed Tawfik Morgan Jones Moritz Maxeiner + Mo Zhou + naivekun + nathancheek Nathaniel Clark Nathaniel Wesley Filardo + Nathan Lewis Nav Ravindranath Neal Gompa (ニール・ゴンパ) Ned Bass Neependra Khare Neil Stockbridge + Nick Black Nick Garvey + Nick Mattis + Nick Terrell + Niklas Haas Nikolay Borisov + nordaux + ofthesun9 Olaf Faaland Oleg Drokin Oleg Stepura + Olivier Mazouffre + omni + Orivej Desh + Pablo Correa Gómez + Palash Gandhi + Patrick Mooney Patrik Greco Paul B. 
Henson Paul Dagnelie @@ -243,69 +465,160 @@ CONTRIBUTORS: Pedro Giffuni Peng Peter Ashford + Peter Dave Hello + Peter Levine + Peter Wirdemo + Petros Koutoupis + Philip Pokorny + Philipp Riederer + Phil Kauffman + Ping Huang + Piotr Kubaj + Piotr P. Stefaniak Prakash Surya Prasad Joshi + privb0x23 + P.SCH + Quentin Zdanis + Rafael Kitover + RageLtMan Ralf Ertzinger Randall Mason Remy Blank + renelson + Reno Reckling Ricardo M. Correia - Rich Ercolani + Riccardo Schirone + Richard Allen Richard Elling Richard Laager Richard Lowe Richard Sharpe Richard Yao + Rich Ercolani + Robert Novak + Roberto Ricci + Rob Norris + Rob Wing Rohan Puri Romain Dolbeau Roman Strashkin + Ross Williams Ruben Kerkhof + Ryan Hirasaki + Ryan Lahfa + Ryan Libby + Ryan Moeller + Sam Hathaway + Sam Lunt + Samuel VERSCHELDE + Samuel Wycliffe + Samuel Wycliffe J + Sanjeev Bagewadi + Sara Hartse Saso Kiselkov + Satadru Pramanik + Savyasachee Jha + Scott Colby Scot W. Stevenson Sean Eric Fagan Sebastian Gottschall + Sebastien Roy Sen Haerens Serapheim Dimitropoulos Seth Forshee + Shaan Nobee Shampavman + Shaun Tancheff Shen Yan Simon Guest Simon Klinkert Sowrabha Gopal + Spencer Kinny + Srikanth N S Stanislav Seletskiy Steffen Müthing Stephen Blinick + sterlingjensen Steve Dougherty + Steve Mokris Steven Burgess Steven Hartland Steven Johnson + Steven Noonan + stf Stian Ellingsen + Stoiko Ivanov + Stéphane Lesimple Suman Chakravartula Sydney Vanda Sören Tempel + Tamas TEVESZ + Teodor Spæren + TerraTech Thijs Cramer + Thomas Geppert + Thomas Lamprecht + Till Maas Tim Chase Tim Connors Tim Crawford Tim Haley + timor + Timothy Day + Tim Schumacher Tino Reichardt Tobin Harding Tom Caputi Tom Matthews - Tom Prince Tomohiro Kusumi + Tom Prince Tony Hutter + Tony Nguyen + Tony Perkins Toomas Soome + Torsten Wörtwein + Toyam Cox + Trevor Bautista Trey Dockendorf + Troels Nørgaard + Tulsi Jain Turbo Fredriksson Tyler J. 
Stachecki + Umer Saleem + Valmiky Arquissandas + Val Packett + Vince van Oosten + Violet Purcell + Vipin Kumar Verma Vitaut Bajaryn + Volker Mauel + Václav Skála + Walter Huf + Warner Losh Weigang Li + WHR Will Andrews Will Rouesnel + Windel Bouwman + Wojciech Małota-Wójcik Wolfgang Bumiller Xin Li + Xinliang Liu + xtouqh + Yann Collet + Yanping Gao Ying Zhu + Youzhong Yang + yparitcher + yuina822 YunQiang Su Yuri Pankov Yuxuan Shui Zachary Bedell + Zach Dykstra + zgock + Zhu Chuang + Érico Nogueira + Đoàn Trần Công Danh + 韩朴宇 diff --git a/META b/META index 9ffe90458d..5868838a26 100644 --- a/META +++ b/META @@ -1,10 +1,10 @@ Meta: 1 Name: zfs Branch: 1.0 -Version: 2.2.0 -Release: rc4 +Version: 2.2.1 +Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.5 +Linux-Maximum: 6.6 Linux-Minimum: 3.10 diff --git a/cmd/arc_summary b/cmd/arc_summary index 426e020705..9c69ec4f8c 100755 --- a/cmd/arc_summary +++ b/cmd/arc_summary @@ -711,7 +711,7 @@ def section_archits(kstats_dict): pd_total = int(arc_stats['prefetch_data_hits']) +\ int(arc_stats['prefetch_data_iohits']) +\ int(arc_stats['prefetch_data_misses']) - prt_2('ARC prefetch metadata accesses:', f_perc(pd_total, all_accesses), + prt_2('ARC prefetch data accesses:', f_perc(pd_total, all_accesses), f_hits(pd_total)) pd_todo = (('Prefetch data hits:', arc_stats['prefetch_data_hits']), ('Prefetch data I/O hits:', arc_stats['prefetch_data_iohits']), diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 4b9921d47b..005bf3f165 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -5179,7 +5179,7 @@ dump_label(const char *dev) if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0) size = buflen; - /* If the device is a cache device clear the header. */ + /* If the device is a cache device read the header. 
*/ if (!read_l2arc_header) { if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 && diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index ad52acda8e..dca6d9ec5c 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -24,6 +24,7 @@ * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016, 2017, Intel Corporation. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + * Copyright (c) 2023, Klara Inc. */ /* @@ -146,6 +147,17 @@ zfs_unavail_pool(zpool_handle_t *zhp, void *data) return (0); } +/* + * Write an array of strings to the zed log + */ +static void lines_to_zed_log_msg(char **lines, int lines_cnt) +{ + int i; + for (i = 0; i < lines_cnt; i++) { + zed_log_msg(LOG_INFO, "%s", lines[i]); + } +} + /* * Two stage replace on Linux * since we get disk notifications @@ -193,14 +205,21 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) uint64_t is_spare = 0; const char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL; char rawpath[PATH_MAX], fullpath[PATH_MAX]; - char devpath[PATH_MAX]; + char pathbuf[PATH_MAX]; int ret; int online_flag = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE; boolean_t is_sd = B_FALSE; boolean_t is_mpath_wholedisk = B_FALSE; uint_t c; vdev_stat_t *vs; + char **lines = NULL; + int lines_cnt = 0; + /* + * Get the persistent path, typically under the '/dev/disk/by-id' or + * '/dev/disk/by-vdev' directories. Note that this path can change + * when a vdev is replaced with a new disk. + */ if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0) return; @@ -359,15 +378,17 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) (void) snprintf(rawpath, sizeof (rawpath), "%s%s", is_sd ? 
DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath); - if (realpath(rawpath, devpath) == NULL && !is_mpath_wholedisk) { + if (realpath(rawpath, pathbuf) == NULL && !is_mpath_wholedisk) { zed_log_msg(LOG_INFO, " realpath: %s failed (%s)", rawpath, strerror(errno)); - (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, - &newstate); + int err = zpool_vdev_online(zhp, fullpath, + ZFS_ONLINE_FORCEFAULT, &newstate); - zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s)", - fullpath, libzfs_error_description(g_zfshdl)); + zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s) " + "err %d, new state %d", + fullpath, libzfs_error_description(g_zfshdl), err, + err ? (int)newstate : 0); return; } @@ -385,6 +406,22 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) if (is_mpath_wholedisk) { /* Don't label device mapper or multipath disks. */ + zed_log_msg(LOG_INFO, + " it's a multipath wholedisk, don't label"); + if (zpool_prepare_disk(zhp, vdev, "autoreplace", &lines, + &lines_cnt) != 0) { + zed_log_msg(LOG_INFO, + " zpool_prepare_disk: could not " + "prepare '%s' (%s)", fullpath, + libzfs_error_description(g_zfshdl)); + if (lines_cnt > 0) { + zed_log_msg(LOG_INFO, + " zfs_prepare_disk output:"); + lines_to_zed_log_msg(lines, lines_cnt); + } + libzfs_free_str_array(lines, lines_cnt); + return; + } } else if (!labeled) { /* * we're auto-replacing a raw disk, so label it first @@ -401,16 +438,24 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) * to trigger a ZFS fault for the device (and any hot spare * replacement). */ - leafname = strrchr(devpath, '/') + 1; + leafname = strrchr(pathbuf, '/') + 1; /* * If this is a request to label a whole disk, then attempt to * write out the label. 
*/ - if (zpool_label_disk(g_zfshdl, zhp, leafname) != 0) { - zed_log_msg(LOG_INFO, " zpool_label_disk: could not " + if (zpool_prepare_and_label_disk(g_zfshdl, zhp, leafname, + vdev, "autoreplace", &lines, &lines_cnt) != 0) { + zed_log_msg(LOG_WARNING, + " zpool_prepare_and_label_disk: could not " "label '%s' (%s)", leafname, libzfs_error_description(g_zfshdl)); + if (lines_cnt > 0) { + zed_log_msg(LOG_INFO, + " zfs_prepare_disk output:"); + lines_to_zed_log_msg(lines, lines_cnt); + } + libzfs_free_str_array(lines, lines_cnt); (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, &newstate); @@ -433,7 +478,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) sizeof (device->pd_physpath)); list_insert_tail(&g_device_list, device); - zed_log_msg(LOG_INFO, " zpool_label_disk: async '%s' (%llu)", + zed_log_msg(LOG_NOTICE, " zpool_label_disk: async '%s' (%llu)", leafname, (u_longlong_t)guid); return; /* resumes at EC_DEV_ADD.ESC_DISK for partition */ @@ -456,8 +501,8 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) } if (!found) { /* unexpected partition slice encountered */ - zed_log_msg(LOG_INFO, "labeled disk %s unexpected here", - fullpath); + zed_log_msg(LOG_WARNING, "labeled disk %s was " + "unexpected here", fullpath); (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, &newstate); return; @@ -466,10 +511,21 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) zed_log_msg(LOG_INFO, " zpool_label_disk: resume '%s' (%llu)", physpath, (u_longlong_t)guid); - (void) snprintf(devpath, sizeof (devpath), "%s%s", - DEV_BYID_PATH, new_devid); + /* + * Paths that begin with '/dev/disk/by-id/' will change and so + * they must be updated before calling zpool_vdev_attach(). 
+ */ + if (strncmp(path, DEV_BYID_PATH, strlen(DEV_BYID_PATH)) == 0) { + (void) snprintf(pathbuf, sizeof (pathbuf), "%s%s", + DEV_BYID_PATH, new_devid); + zed_log_msg(LOG_INFO, " zpool_label_disk: path '%s' " + "replaced by '%s'", path, pathbuf); + path = pathbuf; + } } + libzfs_free_str_array(lines, lines_cnt); + /* * Construct the root vdev to pass to zpool_vdev_attach(). While adding * the entire vdev structure is harmless, we construct a reduced set of @@ -508,9 +564,11 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) * Wait for udev to verify the links exist, then auto-replace * the leaf disk at same physical location. */ - if (zpool_label_disk_wait(path, 3000) != 0) { - zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement " - "disk %s is missing", path); + if (zpool_label_disk_wait(path, DISK_LABEL_WAIT) != 0) { + zed_log_msg(LOG_WARNING, "zfs_mod: pool '%s', after labeling " + "replacement disk, the expected disk partition link '%s' " + "is missing after waiting %u ms", + zpool_get_name(zhp), path, DISK_LABEL_WAIT); nvlist_free(nvroot); return; } @@ -525,7 +583,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) B_TRUE, B_FALSE); } - zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)", + zed_log_msg(LOG_WARNING, " zpool_vdev_replace: %s with %s (%s)", fullpath, path, (ret == 0) ? 
"no errors" : libzfs_error_description(g_zfshdl)); @@ -623,7 +681,7 @@ zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data) dp->dd_prop, path); dp->dd_found = B_TRUE; - /* pass the new devid for use by replacing code */ + /* pass the new devid for use by auto-replacing code */ if (dp->dd_new_devid != NULL) { (void) nvlist_add_string(nvl, "new_devid", dp->dd_new_devid); diff --git a/cmd/zpool/compatibility.d/openzfsonosx-1.8.1 b/cmd/zpool/compatibility.d/openzfsonosx-1.8.1 index 162ff32a78..125c578344 100644 --- a/cmd/zpool/compatibility.d/openzfsonosx-1.8.1 +++ b/cmd/zpool/compatibility.d/openzfsonosx-1.8.1 @@ -6,7 +6,6 @@ edonr embedded_data empty_bpobj enabled_txg -encryption extensible_dataset filesystem_limits hole_birth diff --git a/cmd/zpool/zpool_iter.c b/cmd/zpool/zpool_iter.c index 7c6549b0ae..506b529dce 100644 --- a/cmd/zpool/zpool_iter.c +++ b/cmd/zpool/zpool_iter.c @@ -443,37 +443,22 @@ vdev_run_cmd(vdev_cmd_data_t *data, char *cmd) { int rc; char *argv[2] = {cmd}; - char *env[5] = {(char *)"PATH=/bin:/sbin:/usr/bin:/usr/sbin"}; + char **env; char **lines = NULL; int lines_cnt = 0; int i; - /* Setup our custom environment variables */ - rc = asprintf(&env[1], "VDEV_PATH=%s", - data->path ? data->path : ""); - if (rc == -1) { - env[1] = NULL; + env = zpool_vdev_script_alloc_env(data->pool, data->path, data->upath, + data->vdev_enc_sysfs_path, NULL, NULL); + if (env == NULL) goto out; - } - - rc = asprintf(&env[2], "VDEV_UPATH=%s", - data->upath ? data->upath : ""); - if (rc == -1) { - env[2] = NULL; - goto out; - } - - rc = asprintf(&env[3], "VDEV_ENC_SYSFS_PATH=%s", - data->vdev_enc_sysfs_path ? 
- data->vdev_enc_sysfs_path : ""); - if (rc == -1) { - env[3] = NULL; - goto out; - } /* Run the command */ rc = libzfs_run_process_get_stdout_nopath(cmd, argv, env, &lines, &lines_cnt); + + zpool_vdev_script_free_env(env); + if (rc != 0) goto out; @@ -485,10 +470,6 @@ vdev_run_cmd(vdev_cmd_data_t *data, char *cmd) out: if (lines != NULL) libzfs_free_str_array(lines, lines_cnt); - - /* Start with i = 1 since env[0] was statically allocated */ - for (i = 1; i < ARRAY_SIZE(env); i++) - free(env[i]); } /* diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index d64fdfa5ba..5507f9d3fd 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -3122,12 +3122,21 @@ zfs_force_import_required(nvlist_t *config) nvlist_t *nvinfo; state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE); - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); + nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); + + /* + * The hostid on LOAD_INFO comes from the MOS label via + * spa_tryimport(). If its not there then we're likely talking to an + * older kernel, so use the top one, which will be from the label + * discovered in zpool_find_import(), or if a cachefile is in use, the + * local hostid. 
+ */ + if (nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_HOSTID, &hostid) != 0) + nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); if (state != POOL_STATE_EXPORTED && hostid != get_system_hostid()) return (B_TRUE); - nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) { mmp_state_t mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); @@ -3198,7 +3207,10 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, time_t timestamp = 0; uint64_t hostid = 0; - if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME)) + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_HOSTNAME)) + hostname = fnvlist_lookup_string(nvinfo, + ZPOOL_CONFIG_HOSTNAME); + else if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME)) hostname = fnvlist_lookup_string(config, ZPOOL_CONFIG_HOSTNAME); @@ -3206,7 +3218,10 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, timestamp = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP); - if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID)) + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_HOSTID)) + hostid = fnvlist_lookup_uint64(nvinfo, + ZPOOL_CONFIG_HOSTID); + else if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID); diff --git a/cmd/zpool/zpool_util.h b/cmd/zpool/zpool_util.h index b35dea0cd4..db8e631dc6 100644 --- a/cmd/zpool/zpool_util.h +++ b/cmd/zpool/zpool_util.h @@ -126,6 +126,10 @@ vdev_cmd_data_list_t *all_pools_for_each_vdev_run(int argc, char **argv, void free_vdev_cmd_data_list(vdev_cmd_data_list_t *vcdl); +void free_vdev_cmd_data(vdev_cmd_data_t *data); + +int vdev_run_cmd_simple(char *path, char *cmd); + int check_device(const char *path, boolean_t force, boolean_t isspare, boolean_t iswholedisk); boolean_t check_sector_size_database(char *path, int *sector_size); diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index b17d5e1f8e..231f86cbe8 100644 --- a/cmd/zpool/zpool_vdev.c +++ 
b/cmd/zpool/zpool_vdev.c @@ -936,6 +936,15 @@ zero_label(const char *path) return (0); } +static void +lines_to_stderr(char *lines[], int lines_cnt) +{ + int i; + for (i = 0; i < lines_cnt; i++) { + fprintf(stderr, "%s\n", lines[i]); + } +} + /* * Go through and find any whole disks in the vdev specification, labelling them * as appropriate. When constructing the vdev spec, we were unable to open this @@ -947,7 +956,7 @@ zero_label(const char *path) * need to get the devid after we label the disk. */ static int -make_disks(zpool_handle_t *zhp, nvlist_t *nv) +make_disks(zpool_handle_t *zhp, nvlist_t *nv, boolean_t replacing) { nvlist_t **child; uint_t c, children; @@ -1036,6 +1045,8 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) */ if (!is_exclusive && !is_spare(NULL, udevpath)) { char *devnode = strrchr(devpath, '/') + 1; + char **lines = NULL; + int lines_cnt = 0; ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT)); if (ret == 0) { @@ -1047,9 +1058,27 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) /* * When labeling a pool the raw device node name * is provided as it appears under /dev/. + * + * Note that 'zhp' will be NULL when we're creating a + * pool. */ - if (zpool_label_disk(g_zfs, zhp, devnode) == -1) + if (zpool_prepare_and_label_disk(g_zfs, zhp, devnode, + nv, zhp == NULL ? "create" : + replacing ? 
"replace" : "add", &lines, + &lines_cnt) != 0) { + (void) fprintf(stderr, + gettext( + "Error preparing/labeling disk.\n")); + if (lines_cnt > 0) { + (void) fprintf(stderr, + gettext("zfs_prepare_disk output:\n")); + lines_to_stderr(lines, lines_cnt); + } + + libzfs_free_str_array(lines, lines_cnt); return (-1); + } + libzfs_free_str_array(lines, lines_cnt); /* * Wait for udev to signal the device is available @@ -1086,19 +1115,19 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) } for (c = 0; c < children; c++) - if ((ret = make_disks(zhp, child[c])) != 0) + if ((ret = make_disks(zhp, child[c], replacing)) != 0) return (ret); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) for (c = 0; c < children; c++) - if ((ret = make_disks(zhp, child[c])) != 0) + if ((ret = make_disks(zhp, child[c], replacing)) != 0) return (ret); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) for (c = 0; c < children; c++) - if ((ret = make_disks(zhp, child[c])) != 0) + if ((ret = make_disks(zhp, child[c], replacing)) != 0) return (ret); return (0); @@ -1758,7 +1787,7 @@ split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, return (NULL); } - if (!flags.dryrun && make_disks(zhp, newroot) != 0) { + if (!flags.dryrun && make_disks(zhp, newroot, B_FALSE) != 0) { nvlist_free(newroot); return (NULL); } @@ -1879,7 +1908,7 @@ make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep, /* * Run through the vdev specification and label any whole disks found. 
*/ - if (!dryrun && make_disks(zhp, newroot) != 0) { + if (!dryrun && make_disks(zhp, newroot, replacing) != 0) { nvlist_free(newroot); return (NULL); } diff --git a/config/Rules.am b/config/Rules.am index 4dba438e12..92d51e70c2 100644 --- a/config/Rules.am +++ b/config/Rules.am @@ -42,6 +42,7 @@ AM_CPPFLAGS += -D_REENTRANT AM_CPPFLAGS += -D_FILE_OFFSET_BITS=64 AM_CPPFLAGS += -D_LARGEFILE64_SOURCE AM_CPPFLAGS += -DLIBEXECDIR=\"$(libexecdir)\" +AM_CPPFLAGS += -DZFSEXECDIR=\"$(zfsexecdir)\" AM_CPPFLAGS += -DRUNSTATEDIR=\"$(runstatedir)\" AM_CPPFLAGS += -DSBINDIR=\"$(sbindir)\" AM_CPPFLAGS += -DSYSCONFDIR=\"$(sysconfdir)\" diff --git a/config/kernel-fsync-bdev.m4 b/config/kernel-fsync-bdev.m4 new file mode 100644 index 0000000000..c47e236f70 --- /dev/null +++ b/config/kernel-fsync-bdev.m4 @@ -0,0 +1,36 @@ +dnl # +dnl # 6.6 API change, +dnl # fsync_bdev was removed in favor of sync_blockdev +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_SYNC_BDEV], [ + ZFS_LINUX_TEST_SRC([fsync_bdev], [ + #include + ],[ + fsync_bdev(NULL); + ]) + + ZFS_LINUX_TEST_SRC([sync_blockdev], [ + #include + ],[ + sync_blockdev(NULL); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SYNC_BDEV], [ + AC_MSG_CHECKING([whether fsync_bdev() exists]) + ZFS_LINUX_TEST_RESULT([fsync_bdev], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FSYNC_BDEV, 1, + [fsync_bdev() is declared in include/blkdev.h]) + ],[ + AC_MSG_CHECKING([whether sync_blockdev() exists]) + ZFS_LINUX_TEST_RESULT([sync_blockdev], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SYNC_BLOCKDEV, 1, + [sync_blockdev() is declared in include/blkdev.h]) + ],[ + ZFS_LINUX_TEST_ERROR( + [neither fsync_bdev() nor sync_blockdev() exist]) + ]) + ]) +]) diff --git a/config/kernel-generic_fillattr.m4 b/config/kernel-generic_fillattr.m4 index 02dee4d4c0..f5323f0dcb 100644 --- a/config/kernel-generic_fillattr.m4 +++ b/config/kernel-generic_fillattr.m4 @@ -7,6 +7,10 @@ dnl # dnl # 6.3 API dnl # generic_fillattr() now takes struct mnt_idmap* as the first argument dnl # +dnl # 6.6 API +dnl # 
generic_fillattr() now takes u32 as second argument, representing a +dnl # request_mask for statx +dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_FILLATTR], [ ZFS_LINUX_TEST_SRC([generic_fillattr_userns], [ #include @@ -25,22 +29,39 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_FILLATTR], [ struct kstat *k = NULL; generic_fillattr(idmap, in, k); ]) + + ZFS_LINUX_TEST_SRC([generic_fillattr_mnt_idmap_reqmask], [ + #include + ],[ + struct mnt_idmap *idmap = NULL; + struct inode *in = NULL; + struct kstat *k = NULL; + generic_fillattr(idmap, 0, in, k); + ]) ]) AC_DEFUN([ZFS_AC_KERNEL_GENERIC_FILLATTR], [ - AC_MSG_CHECKING([whether generic_fillattr requires struct mnt_idmap*]) - ZFS_LINUX_TEST_RESULT([generic_fillattr_mnt_idmap], [ + AC_MSG_CHECKING( + [whether generic_fillattr requires struct mnt_idmap* and request_mask]) + ZFS_LINUX_TEST_RESULT([generic_fillattr_mnt_idmap_reqmask], [ AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_GENERIC_FILLATTR_IDMAP, 1, - [generic_fillattr requires struct mnt_idmap*]) + AC_DEFINE(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK, 1, + [generic_fillattr requires struct mnt_idmap* and u32 request_mask]) ],[ - AC_MSG_CHECKING([whether generic_fillattr requires struct user_namespace*]) - ZFS_LINUX_TEST_RESULT([generic_fillattr_userns], [ + AC_MSG_CHECKING([whether generic_fillattr requires struct mnt_idmap*]) + ZFS_LINUX_TEST_RESULT([generic_fillattr_mnt_idmap], [ AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_GENERIC_FILLATTR_USERNS, 1, - [generic_fillattr requires struct user_namespace*]) + AC_DEFINE(HAVE_GENERIC_FILLATTR_IDMAP, 1, + [generic_fillattr requires struct mnt_idmap*]) ],[ - AC_MSG_RESULT([no]) + AC_MSG_CHECKING([whether generic_fillattr requires struct user_namespace*]) + ZFS_LINUX_TEST_RESULT([generic_fillattr_userns], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_GENERIC_FILLATTR_USERNS, 1, + [generic_fillattr requires struct user_namespace*]) + ],[ + AC_MSG_RESULT([no]) + ]) ]) ]) ]) diff --git a/config/kernel-inode-times.m4 b/config/kernel-inode-times.m4 index 
9c016c7900..412e13b47d 100644 --- a/config/kernel-inode-times.m4 +++ b/config/kernel-inode-times.m4 @@ -27,6 +27,31 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_TIMES], [ memset(&ip, 0, sizeof(ip)); ts = ip.i_mtime; ]) + + dnl # + dnl # 6.6 API change + dnl # i_ctime no longer directly accessible, must use + dnl # inode_get_ctime(ip), inode_set_ctime*(ip) to + dnl # read/write. + dnl # + ZFS_LINUX_TEST_SRC([inode_get_ctime], [ + #include + ],[ + struct inode ip; + + memset(&ip, 0, sizeof(ip)); + inode_get_ctime(&ip); + ]) + + ZFS_LINUX_TEST_SRC([inode_set_ctime_to_ts], [ + #include + ],[ + struct inode ip; + struct timespec64 ts; + + memset(&ip, 0, sizeof(ip)); + inode_set_ctime_to_ts(&ip, ts); + ]) ]) AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [ @@ -47,4 +72,22 @@ AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [ AC_DEFINE(HAVE_INODE_TIMESPEC64_TIMES, 1, [inode->i_*time's are timespec64]) ]) + + AC_MSG_CHECKING([whether inode_get_ctime() exists]) + ZFS_LINUX_TEST_RESULT([inode_get_ctime], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INODE_GET_CTIME, 1, + [inode_get_ctime() exists in linux/fs.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([whether inode_set_ctime_to_ts() exists]) + ZFS_LINUX_TEST_RESULT([inode_set_ctime_to_ts], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INODE_SET_CTIME_TO_TS, 1, + [inode_set_ctime_to_ts() exists in linux/fs.h]) + ],[ + AC_MSG_RESULT(no) + ]) ]) diff --git a/config/kernel.m4 b/config/kernel.m4 index df194ec722..056517a841 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -162,6 +162,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_RECLAIMED ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ + ZFS_AC_KERNEL_SRC_SYNC_BDEV case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -303,6 +304,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_RECLAIMED ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE ZFS_AC_KERNEL_COPY_SPLICE_READ + ZFS_AC_KERNEL_SYNC_BDEV case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE 
diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index 2703e6c016..5f36569fe2 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -358,6 +358,9 @@ AC_DEFUN([ZFS_AC_RPM], [ AS_IF([test -n "$udevruledir" ], [ RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_udevruledir $(udevruledir)"' ]) + AS_IF([test -n "$bashcompletiondir" ], [ + RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_bashcompletiondir $(bashcompletiondir)"' + ]) RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_SYSTEMD)' RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PYZFS)' RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PAM)' @@ -617,6 +620,18 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_RESULT([no]) fi AC_SUBST(RPM_DEFINE_INITRAMFS) + + AC_MSG_CHECKING([default bash completion directory]) + case "$VENDOR" in + ubuntu) bashcompletiondir=/usr/share/bash-completion/completions ;; + debian) bashcompletiondir=/usr/share/bash-completion/completions ;; + freebsd) bashcompletiondir=$sysconfdir/bash_completion.d;; + gentoo) bashcompletiondir=/usr/share/bash-completion/completions ;; + *) bashcompletiondir=/etc/bash_completion.d ;; + esac + AC_MSG_RESULT([$bashcompletiondir]) + AC_SUBST(bashcompletiondir) + ]) dnl # diff --git a/contrib/bash_completion.d/Makefile.am b/contrib/bash_completion.d/Makefile.am index dc4b610c42..1ec05ed73d 100644 --- a/contrib/bash_completion.d/Makefile.am +++ b/contrib/bash_completion.d/Makefile.am @@ -1,5 +1,3 @@ -bashcompletiondir = $(sysconfdir)/bash_completion.d - nodist_bashcompletion_DATA = %D%/zfs SUBSTFILES += $(nodist_bashcompletion_DATA) diff --git a/contrib/debian/openzfs-zfsutils.install b/contrib/debian/openzfs-zfsutils.install index 301d8f67b3..741014398a 100644 --- a/contrib/debian/openzfs-zfsutils.install +++ b/contrib/debian/openzfs-zfsutils.install @@ -1,7 +1,6 @@ etc/default/zfs etc/zfs/zfs-functions etc/zfs/zpool.d/ -etc/bash_completion.d/zfs lib/systemd/system-generators/ lib/systemd/system-preset/ lib/systemd/system/zfs-import-cache.service @@ -35,6 +34,7 
@@ usr/bin/zvol_wait usr/lib/modules-load.d/ lib/ usr/lib/zfs-linux/zpool.d/ usr/lib/zfs-linux/zpool_influxdb +usr/lib/zfs-linux/zfs_prepare_disk usr/sbin/arc_summary usr/sbin/arcstat usr/sbin/dbufstat @@ -88,6 +88,7 @@ usr/share/man/man8/zfs-wait.8 usr/share/man/man8/zfs-zone.8 usr/share/man/man8/zfs.8 usr/share/man/man8/zfs_ids_to_path.8 +usr/share/man/man8/zfs_prepare_disk.8 usr/share/man/man7/zfsconcepts.7 usr/share/man/man7/zfsprops.7 usr/share/man/man8/zgenhostid.8 diff --git a/contrib/debian/rules.in b/contrib/debian/rules.in index e8cb5aa0ce..38c7038a70 100755 --- a/contrib/debian/rules.in +++ b/contrib/debian/rules.in @@ -71,10 +71,6 @@ override_dh_auto_install: @# Install the utilities. $(MAKE) install DESTDIR='$(CURDIR)/debian/tmp' - # Use upstream's bash completion - install -D -t '$(CURDIR)/debian/tmp/usr/share/bash-completion/completions/' \ - '$(CURDIR)/contrib/bash_completion.d/zfs' - # Move from bin_dir to /usr/sbin # Remove suffix (.py) as per policy 10.4 - Scripts # https://www.debian.org/doc/debian-policy/ch-files.html#s-scripts @@ -136,7 +132,6 @@ override_dh_auto_install: chmod a-x '$(CURDIR)/debian/tmp/etc/zfs/zfs-functions' chmod a-x '$(CURDIR)/debian/tmp/etc/default/zfs' - chmod a-x '$(CURDIR)/debian/tmp/usr/share/bash-completion/completions/zfs' override_dh_python3: dh_python3 -p openzfs-python3-pyzfs diff --git a/include/libzfs.h b/include/libzfs.h index 6c36692737..4adfa38e87 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -326,6 +326,15 @@ _LIBZFS_H nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); _LIBZFS_H int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, const char *); +_LIBZFS_H int zpool_prepare_disk(zpool_handle_t *zhp, nvlist_t *vdev_nv, + const char *prepare_str, char **lines[], int *lines_cnt); +_LIBZFS_H int zpool_prepare_and_label_disk(libzfs_handle_t *hdl, + zpool_handle_t *, const char *, nvlist_t *vdev_nv, const char *prepare_str, + char **lines[], 
int *lines_cnt); +_LIBZFS_H char ** zpool_vdev_script_alloc_env(const char *pool_name, + const char *vdev_path, const char *vdev_upath, + const char *vdev_enc_sysfs_path, const char *opt_key, const char *opt_val); +_LIBZFS_H void zpool_vdev_script_free_env(char **env); _LIBZFS_H uint64_t zpool_vdev_path_to_guid(zpool_handle_t *zhp, const char *path); diff --git a/include/libzutil.h b/include/libzutil.h index 237ff976ba..053b1ed4b5 100644 --- a/include/libzutil.h +++ b/include/libzutil.h @@ -34,7 +34,7 @@ extern "C" { #endif /* - * Default wait time for a device name to be created. + * Default wait time in milliseconds for a device name to be created. */ #define DISK_LABEL_WAIT (30 * 1000) /* 30 seconds */ diff --git a/include/os/freebsd/spl/sys/atomic.h b/include/os/freebsd/spl/sys/atomic.h index 8b9cec15c5..40a67704fd 100644 --- a/include/os/freebsd/spl/sys/atomic.h +++ b/include/os/freebsd/spl/sys/atomic.h @@ -167,7 +167,7 @@ atomic_dec_64_nv(volatile uint64_t *target) return (atomic_add_64_nv(target, -1)); } -#if !defined(COMPAT_32BIT) && defined(__LP64__) +#ifdef __LP64__ static __inline void * atomic_cas_ptr(volatile void *target, void *cmp, void *newval) { @@ -181,7 +181,7 @@ atomic_cas_ptr(volatile void *target, void *cmp, void *newval) return ((void *)atomic_cas_32((volatile uint32_t *)target, (uint32_t)cmp, (uint32_t)newval)); } -#endif /* !defined(COMPAT_32BIT) && defined(__LP64__) */ +#endif /* __LP64__ */ #else /* _STANDALONE */ /* @@ -190,6 +190,8 @@ atomic_cas_ptr(volatile void *target, void *cmp, void *newval) */ #undef atomic_add_64 #define atomic_add_64(ptr, val) *(ptr) += val +#undef atomic_sub_64 +#define atomic_sub_64(ptr, val) *(ptr) -= val #endif /* !_STANDALONE */ #endif /* !_OPENSOLARIS_SYS_ATOMIC_H_ */ diff --git a/include/os/freebsd/spl/sys/ccompat.h b/include/os/freebsd/spl/sys/ccompat.h index eaee9159ea..e34bab7e89 100644 --- a/include/os/freebsd/spl/sys/ccompat.h +++ b/include/os/freebsd/spl/sys/ccompat.h @@ -1,5 +1,5 @@ /* - * 
SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * SPDX-License-Identifier: BSD-2-Clause * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions diff --git a/include/os/freebsd/spl/sys/ctype.h b/include/os/freebsd/spl/sys/ctype.h index 53afd8b8bd..b9ca0f8ef2 100644 --- a/include/os/freebsd/spl/sys/ctype.h +++ b/include/os/freebsd/spl/sys/ctype.h @@ -39,5 +39,6 @@ ((C) >= 0x3A && (C) <= 0x40) || \ ((C) >= 0x5B && (C) <= 0x60) || \ ((C) >= 0x7B && (C) <= 0x7E)) +#define isspace(C) ((C) == 0x20 || ((C) >= 0x9 && (C) <= 0xD)) #endif diff --git a/include/os/freebsd/spl/sys/mutex.h b/include/os/freebsd/spl/sys/mutex.h index e757d12c15..8cfe56c753 100644 --- a/include/os/freebsd/spl/sys/mutex.h +++ b/include/os/freebsd/spl/sys/mutex.h @@ -64,6 +64,7 @@ typedef enum { } while (0) #define mutex_destroy(lock) sx_destroy(lock) #define mutex_enter(lock) sx_xlock(lock) +#define mutex_enter_interruptible(lock) sx_xlock_sig(lock) #define mutex_enter_nested(lock, type) sx_xlock(lock) #define mutex_tryenter(lock) sx_try_xlock(lock) #define mutex_exit(lock) sx_xunlock(lock) diff --git a/include/os/freebsd/spl/sys/spl_condvar.h b/include/os/freebsd/spl/sys/spl_condvar.h index 7405f647d5..2835adafd4 100644 --- a/include/os/freebsd/spl/sys/spl_condvar.h +++ b/include/os/freebsd/spl/sys/spl_condvar.h @@ -1,5 +1,5 @@ /* - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2000 Jake Burkholder . * All rights reserved. 
diff --git a/include/os/freebsd/spl/sys/taskq.h b/include/os/freebsd/spl/sys/taskq.h index 30579b3917..b23a939b3a 100644 --- a/include/os/freebsd/spl/sys/taskq.h +++ b/include/os/freebsd/spl/sys/taskq.h @@ -30,9 +30,9 @@ #include #include +#include #include #include -#include #ifdef __cplusplus extern "C" { @@ -48,16 +48,16 @@ typedef uintptr_t taskqid_t; typedef void (task_func_t)(void *); typedef struct taskq_ent { - struct task tqent_task; - struct timeout_task tqent_timeout_task; + union { + struct task tqent_task; + struct timeout_task tqent_timeout_task; + }; task_func_t *tqent_func; void *tqent_arg; - taskqid_t tqent_id; - CK_LIST_ENTRY(taskq_ent) tqent_hash; - uint8_t tqent_type; - uint8_t tqent_registered; - uint8_t tqent_cancelled; - volatile uint32_t tqent_rc; + taskqid_t tqent_id; + LIST_ENTRY(taskq_ent) tqent_hash; + uint_t tqent_type; + volatile uint_t tqent_rc; } taskq_ent_t; /* diff --git a/include/os/freebsd/zfs/sys/arc_os.h b/include/os/freebsd/zfs/sys/arc_os.h index a95618b91f..ad2aba23b9 100644 --- a/include/os/freebsd/zfs/sys/arc_os.h +++ b/include/os/freebsd/zfs/sys/arc_os.h @@ -1,5 +1,5 @@ /* - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Martin Matuska * diff --git a/include/os/freebsd/zfs/sys/freebsd_event.h b/include/os/freebsd/zfs/sys/freebsd_event.h index 544ff8b0f8..a32596d918 100644 --- a/include/os/freebsd/zfs/sys/freebsd_event.h +++ b/include/os/freebsd/zfs/sys/freebsd_event.h @@ -1,5 +1,5 @@ /* - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Rob Wing * diff --git a/include/os/freebsd/zfs/sys/zfs_context_os.h b/include/os/freebsd/zfs/sys/zfs_context_os.h index 1ce7233041..457fa3af81 100644 --- a/include/os/freebsd/zfs/sys/zfs_context_os.h +++ b/include/os/freebsd/zfs/sys/zfs_context_os.h @@ -78,7 +78,7 @@ extern int hz; extern int tick; typedef int fstrans_cookie_t; #define spl_fstrans_mark() (0) 
-#define spl_fstrans_unmark(x) (x = 0) +#define spl_fstrans_unmark(x) ((void)x) #define signal_pending(x) SIGPENDING(x) #define current curthread #define thread_join(x) diff --git a/include/os/freebsd/zfs/sys/zfs_vfsops_os.h b/include/os/freebsd/zfs/sys/zfs_vfsops_os.h index 24bb03575f..56a0ac96ac 100644 --- a/include/os/freebsd/zfs/sys/zfs_vfsops_os.h +++ b/include/os/freebsd/zfs/sys/zfs_vfsops_os.h @@ -286,6 +286,7 @@ typedef struct zfid_long { extern uint_t zfs_fsyncer_key; extern int zfs_super_owner; +extern int zfs_bclone_enabled; extern void zfs_init(void); extern void zfs_fini(void); diff --git a/include/os/linux/kernel/linux/vfs_compat.h b/include/os/linux/kernel/linux/vfs_compat.h index e156ed41c2..aea8bd5ed2 100644 --- a/include/os/linux/kernel/linux/vfs_compat.h +++ b/include/os/linux/kernel/linux/vfs_compat.h @@ -461,10 +461,16 @@ zpl_is_32bit_api(void) * 6.3 API change * generic_fillattr() first arg is changed to struct mnt_idmap * * + * 6.6 API change + * generic_fillattr() gets new second arg request_mask, a u32 type + * */ #ifdef HAVE_GENERIC_FILLATTR_IDMAP #define zpl_generic_fillattr(idmap, ip, sp) \ generic_fillattr(idmap, ip, sp) +#elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK) +#define zpl_generic_fillattr(idmap, rqm, ip, sp) \ + generic_fillattr(idmap, rqm, ip, sp) #elif defined(HAVE_GENERIC_FILLATTR_USERNS) #define zpl_generic_fillattr(user_ns, ip, sp) \ generic_fillattr(user_ns, ip, sp) diff --git a/include/os/linux/spl/sys/kmem_cache.h b/include/os/linux/spl/sys/kmem_cache.h index 20eeadc46e..82d50b6034 100644 --- a/include/os/linux/spl/sys/kmem_cache.h +++ b/include/os/linux/spl/sys/kmem_cache.h @@ -108,7 +108,7 @@ typedef struct spl_kmem_magazine { uint32_t skm_refill; /* Batch refill size */ struct spl_kmem_cache *skm_cache; /* Owned by cache */ unsigned int skm_cpu; /* Owned by cpu */ - void *skm_objs[0]; /* Object pointers */ + void *skm_objs[]; /* Object pointers */ } spl_kmem_magazine_t; typedef struct spl_kmem_obj { diff --git 
a/include/os/linux/spl/sys/mutex.h b/include/os/linux/spl/sys/mutex.h index 6b61c59c48..b4eaa0266d 100644 --- a/include/os/linux/spl/sys/mutex.h +++ b/include/os/linux/spl/sys/mutex.h @@ -128,7 +128,6 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp) \ #define NESTED_SINGLE 1 -#ifdef CONFIG_DEBUG_LOCK_ALLOC #define mutex_enter_nested(mp, subclass) \ { \ ASSERT3P(mutex_owner(mp), !=, current); \ @@ -137,16 +136,22 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp) \ spl_mutex_lockdep_on_maybe(mp); \ spl_mutex_set_owner(mp); \ } -#else /* CONFIG_DEBUG_LOCK_ALLOC */ -#define mutex_enter_nested(mp, subclass) \ -{ \ + +#define mutex_enter_interruptible(mp) \ +/* CSTYLED */ \ +({ \ + int _rc_; \ + \ ASSERT3P(mutex_owner(mp), !=, current); \ spl_mutex_lockdep_off_maybe(mp); \ - mutex_lock(MUTEX(mp)); \ + _rc_ = mutex_lock_interruptible(MUTEX(mp)); \ spl_mutex_lockdep_on_maybe(mp); \ - spl_mutex_set_owner(mp); \ -} -#endif /* CONFIG_DEBUG_LOCK_ALLOC */ + if (!_rc_) { \ + spl_mutex_set_owner(mp); \ + } \ + \ + _rc_; \ +}) #define mutex_enter(mp) mutex_enter_nested((mp), 0) diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h index cce097e16f..a4b600004c 100644 --- a/include/os/linux/spl/sys/uio.h +++ b/include/os/linux/spl/sys/uio.h @@ -73,13 +73,6 @@ typedef struct zfs_uio { size_t uio_skip; struct request *rq; - - /* - * Used for saving rq_for_each_segment() state between calls - * to zfs_uiomove_bvec_rq(). 
- */ - struct req_iterator iter; - struct bio_vec bv; } zfs_uio_t; @@ -138,7 +131,6 @@ zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio, struct request *rq) } else { uio->uio_bvec = NULL; uio->uio_iovcnt = 0; - memset(&uio->iter, 0, sizeof (uio->iter)); } uio->uio_loffset = io_offset(bio, rq); diff --git a/include/os/linux/zfs/sys/trace_arc.h b/include/os/linux/zfs/sys/trace_arc.h index c494f48bb4..f749223daa 100644 --- a/include/os/linux/zfs/sys/trace_arc.h +++ b/include/os/linux/zfs/sys/trace_arc.h @@ -51,7 +51,6 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, __array(uint64_t, hdr_dva_word, 2) __field(uint64_t, hdr_birth) __field(uint32_t, hdr_flags) - __field(uint32_t, hdr_bufcnt) __field(arc_buf_contents_t, hdr_type) __field(uint16_t, hdr_psize) __field(uint16_t, hdr_lsize) @@ -70,7 +69,6 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, __entry->hdr_dva_word[1] = ab->b_dva.dva_word[1]; __entry->hdr_birth = ab->b_birth; __entry->hdr_flags = ab->b_flags; - __entry->hdr_bufcnt = ab->b_l1hdr.b_bufcnt; __entry->hdr_psize = ab->b_psize; __entry->hdr_lsize = ab->b_lsize; __entry->hdr_spa = ab->b_spa; @@ -84,12 +82,12 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, __entry->hdr_refcount = ab->b_l1hdr.b_refcnt.rc_count; ), TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " - "flags 0x%x bufcnt %u type %u psize %u lsize %u spa %llu " + "flags 0x%x type %u psize %u lsize %u spa %llu " "state_type %u access %lu mru_hits %u mru_ghost_hits %u " "mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }", __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], __entry->hdr_birth, __entry->hdr_flags, - __entry->hdr_bufcnt, __entry->hdr_type, __entry->hdr_psize, + __entry->hdr_type, __entry->hdr_psize, __entry->hdr_lsize, __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, __entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits, __entry->hdr_mfu_hits, @@ -192,7 +190,6 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, __array(uint64_t, hdr_dva_word, 2) __field(uint64_t, hdr_birth) 
__field(uint32_t, hdr_flags) - __field(uint32_t, hdr_bufcnt) __field(arc_buf_contents_t, hdr_type) __field(uint16_t, hdr_psize) __field(uint16_t, hdr_lsize) @@ -223,7 +220,6 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, __entry->hdr_dva_word[1] = hdr->b_dva.dva_word[1]; __entry->hdr_birth = hdr->b_birth; __entry->hdr_flags = hdr->b_flags; - __entry->hdr_bufcnt = hdr->b_l1hdr.b_bufcnt; __entry->hdr_psize = hdr->b_psize; __entry->hdr_lsize = hdr->b_lsize; __entry->hdr_spa = hdr->b_spa; @@ -255,7 +251,7 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, __entry->zb_blkid = zb->zb_blkid; ), TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " - "flags 0x%x bufcnt %u psize %u lsize %u spa %llu state_type %u " + "flags 0x%x psize %u lsize %u spa %llu state_type %u " "access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u " "mfu_ghost_hits %u l2_hits %u refcount %lli } " "bp { dva0 0x%llx:0x%llx dva1 0x%llx:0x%llx dva2 " @@ -264,7 +260,7 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, "blkid %llu }", __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], __entry->hdr_birth, __entry->hdr_flags, - __entry->hdr_bufcnt, __entry->hdr_psize, __entry->hdr_lsize, + __entry->hdr_psize, __entry->hdr_lsize, __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, __entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits, __entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits, diff --git a/include/os/linux/zfs/sys/trace_dbuf.h b/include/os/linux/zfs/sys/trace_dbuf.h index 11d25be35b..0f6a98b47d 100644 --- a/include/os/linux/zfs/sys/trace_dbuf.h +++ b/include/os/linux/zfs/sys/trace_dbuf.h @@ -60,8 +60,12 @@ #define DBUF_TP_FAST_ASSIGN \ if (db != NULL) { \ - __assign_str(os_spa, \ - spa_name(DB_DNODE(db)->dn_objset->os_spa)); \ + if (POINTER_IS_VALID(DB_DNODE(db)->dn_objset)) { \ + __assign_str(os_spa, \ + spa_name(DB_DNODE(db)->dn_objset->os_spa)); \ + } else { \ + __assign_str(os_spa, "NULL"); \ + } \ \ __entry->ds_object = db->db_objset->os_dsl_dataset ? 
\ db->db_objset->os_dsl_dataset->ds_object : 0; \ diff --git a/include/os/linux/zfs/sys/zfs_vfsops_os.h b/include/os/linux/zfs/sys/zfs_vfsops_os.h index b4d5db21f5..2204665502 100644 --- a/include/os/linux/zfs/sys/zfs_vfsops_os.h +++ b/include/os/linux/zfs/sys/zfs_vfsops_os.h @@ -45,6 +45,8 @@ extern "C" { typedef struct zfsvfs zfsvfs_t; struct znode; +extern int zfs_bclone_enabled; + /* * This structure emulates the vfs_t from other platforms. It's purpose * is to facilitate the handling of mount options and minimize structural diff --git a/include/os/linux/zfs/sys/zfs_vnops_os.h b/include/os/linux/zfs/sys/zfs_vnops_os.h index 7a1db7deee..830c76e574 100644 --- a/include/os/linux/zfs/sys/zfs_vnops_os.h +++ b/include/os/linux/zfs/sys/zfs_vnops_os.h @@ -56,7 +56,12 @@ extern int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, extern int zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, int flags); extern int zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr); +#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK +extern int zfs_getattr_fast(zidmap_t *, u32 request_mask, struct inode *ip, + struct kstat *sp); +#else extern int zfs_getattr_fast(zidmap_t *, struct inode *ip, struct kstat *sp); +#endif extern int zfs_setattr(znode_t *zp, vattr_t *vap, int flag, cred_t *cr, zidmap_t *mnt_ns); extern int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h index 5d11c79101..707c6ddb11 100644 --- a/include/os/linux/zfs/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -60,7 +60,7 @@ extern const struct file_operations zpl_file_operations; extern const struct file_operations zpl_dir_file_operations; /* zpl_super.c */ -extern void zpl_prune_sb(int64_t nr_to_scan, void *arg); +extern void zpl_prune_sb(uint64_t nr_to_scan, void *arg); extern const struct super_operations zpl_super_operations; extern const struct export_operations zpl_export_operations; @@ -272,4 +272,15 @@ extern 
long zpl_ioctl_fideduperange(struct file *filp, void *arg); #define zpl_setattr_prepare(ns, dentry, ia) setattr_prepare(dentry, ia) #endif +#ifdef HAVE_INODE_GET_CTIME +#define zpl_inode_get_ctime(ip) inode_get_ctime(ip) +#else +#define zpl_inode_get_ctime(ip) (ip->i_ctime) +#endif +#ifdef HAVE_INODE_SET_CTIME_TO_TS +#define zpl_inode_set_ctime_to_ts(ip, ts) inode_set_ctime_to_ts(ip, ts) +#else +#define zpl_inode_set_ctime_to_ts(ip, ts) (ip->i_ctime = ts) +#endif + #endif /* _SYS_ZPL_H */ diff --git a/include/sys/arc.h b/include/sys/arc.h index 9d67dab06c..05307aab99 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -81,7 +81,7 @@ typedef struct arc_prune arc_prune_t; typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *priv); typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv); -typedef void arc_prune_func_t(int64_t bytes, void *priv); +typedef void arc_prune_func_t(uint64_t bytes, void *priv); /* Shared module parameters */ extern uint_t zfs_arc_average_blocksize; diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 78774792f3..defebe3b2f 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -159,10 +159,6 @@ struct arc_write_callback { * these two allocation states. 
*/ typedef struct l1arc_buf_hdr { - /* for waiting on reads to complete */ - kcondvar_t b_cv; - uint8_t b_byteswap; - /* protected by arc state mutex */ arc_state_t *b_state; multilist_node_t b_arc_node; @@ -173,7 +169,7 @@ typedef struct l1arc_buf_hdr { uint32_t b_mru_ghost_hits; uint32_t b_mfu_hits; uint32_t b_mfu_ghost_hits; - uint32_t b_bufcnt; + uint8_t b_byteswap; arc_buf_t *b_buf; /* self protecting */ @@ -436,12 +432,12 @@ typedef struct l2arc_dev { */ typedef struct arc_buf_hdr_crypt { abd_t *b_rabd; /* raw encrypted data */ - dmu_object_type_t b_ot; /* object type */ - uint32_t b_ebufcnt; /* count of encrypted buffers */ /* dsobj for looking up encryption key for l2arc encryption */ uint64_t b_dsobj; + dmu_object_type_t b_ot; /* object type */ + /* encryption parameters */ uint8_t b_salt[ZIO_DATA_SALT_LEN]; uint8_t b_iv[ZIO_DATA_IV_LEN]; @@ -1069,7 +1065,6 @@ extern void arc_wait_for_eviction(uint64_t, boolean_t); extern void arc_lowmem_init(void); extern void arc_lowmem_fini(void); -extern void arc_prune_async(uint64_t); extern int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg); extern uint64_t arc_free_memory(void); extern int64_t arc_available_memory(void); diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index d328068890..4f434291dd 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -250,7 +250,6 @@ struct metaslab_group { int64_t mg_activation_count; metaslab_class_t *mg_class; vdev_t *mg_vd; - taskq_t *mg_taskq; metaslab_group_t *mg_prev; metaslab_group_t *mg_next; diff --git a/include/sys/spa.h b/include/sys/spa.h index b908556874..87ddbd90e1 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -837,7 +837,7 @@ extern kmutex_t spa_namespace_lock; extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t, boolean_t); extern void spa_config_load(void); -extern nvlist_t *spa_all_configs(uint64_t *); +extern int spa_all_configs(uint64_t *generation, nvlist_t **pools); extern 
void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 588c72f6e4..cdf65c3713 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -423,7 +423,9 @@ struct spa { hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */ taskq_t *spa_zvol_taskq; /* Taskq for minor management */ + taskq_t *spa_metaslab_taskq; /* Taskq for metaslab preload */ taskq_t *spa_prefetch_taskq; /* Taskq for prefetch threads */ + taskq_t *spa_upgrade_taskq; /* Taskq for upgrade jobs */ uint64_t spa_multihost; /* multihost aware (mmp) */ mmp_thread_t spa_mmp; /* multihost mmp thread */ list_t spa_leaf_list; /* list of leaf vdevs */ @@ -447,8 +449,6 @@ struct spa { */ spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */ zfs_refcount_t spa_refcount; /* number of opens */ - - taskq_t *spa_upgrade_taskq; /* taskq for upgrade jobs */ }; extern char *spa_config_path; diff --git a/include/sys/txg_impl.h b/include/sys/txg_impl.h index 45fde2e1f3..8ab7969b25 100644 --- a/include/sys/txg_impl.h +++ b/include/sys/txg_impl.h @@ -73,8 +73,7 @@ struct tx_cpu { kcondvar_t tc_cv[TXG_SIZE]; uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */ list_t tc_callbacks[TXG_SIZE]; /* commit cb list */ - char tc_pad[8]; /* pad to fill 3 cache lines */ -}; +} ____cacheline_aligned; /* * The tx_state structure maintains the state information about the different diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index ad9dc3aefd..3f2312c234 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -131,7 +131,10 @@ typedef const struct vdev_ops { * Virtual device properties */ typedef union vdev_queue_class { - list_t vqc_list; + struct { + ulong_t vqc_list_numnodes; + list_t vqc_list; + }; avl_tree_t vqc_tree; } vdev_queue_class_t; diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h 
index c1037fa12e..73c26dff1e 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -130,7 +130,7 @@ typedef struct raidz_row { uint64_t rr_offset; /* Logical offset for *_io_verify() */ uint64_t rr_size; /* Physical size for *_io_verify() */ #endif - raidz_col_t rr_col[0]; /* Flexible array of I/O columns */ + raidz_col_t rr_col[]; /* Flexible array of I/O columns */ } raidz_row_t; typedef struct raidz_map { @@ -139,7 +139,7 @@ typedef struct raidz_map { int rm_nskip; /* RAIDZ sectors skipped for padding */ int rm_skipstart; /* Column index of padding start */ const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ - raidz_row_t *rm_row[0]; /* flexible array of rows */ + raidz_row_t *rm_row[]; /* flexible array of rows */ } raidz_map_t; diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 6a337b49ed..750ca612b9 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -274,11 +274,13 @@ typedef struct kmutex { extern void mutex_init(kmutex_t *mp, char *name, int type, void *cookie); extern void mutex_destroy(kmutex_t *mp); extern void mutex_enter(kmutex_t *mp); +extern int mutex_enter_check_return(kmutex_t *mp); extern void mutex_exit(kmutex_t *mp); extern int mutex_tryenter(kmutex_t *mp); #define NESTED_SINGLE 1 #define mutex_enter_nested(mp, class) mutex_enter(mp) +#define mutex_enter_interruptible(mp) mutex_enter_check_return(mp) /* * RW locks */ diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 8658d39e28..2d612a16b2 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -515,6 +515,8 @@ + + @@ -562,6 +564,8 @@ + + diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index b94abea3d5..fdd1975fa6 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -2071,3 +2071,196 @@ printf_color(const char *color, const char *format, ...) 
return (rc); } + +/* PATH + 5 env vars + a NULL entry = 7 */ +#define ZPOOL_VDEV_SCRIPT_ENV_COUNT 7 + +/* + * There's a few places where ZFS will call external scripts (like the script + * in zpool.d/ and `zfs_prepare_disk`). These scripts are called with a + * reduced $PATH, and some vdev specific environment vars set. This function + * will allocate an populate the environment variable array that is passed to + * these scripts. The user must free the arrays with zpool_vdev_free_env() when + * they are done. + * + * The following env vars will be set (but value could be blank): + * + * POOL_NAME + * VDEV_PATH + * VDEV_UPATH + * VDEV_ENC_SYSFS_PATH + * + * In addition, you can set an optional environment variable named 'opt_key' + * to 'opt_val' if you want. + * + * Returns allocated env[] array on success, NULL otherwise. + */ +char ** +zpool_vdev_script_alloc_env(const char *pool_name, + const char *vdev_path, const char *vdev_upath, + const char *vdev_enc_sysfs_path, const char *opt_key, const char *opt_val) +{ + char **env = NULL; + int rc; + + env = calloc(ZPOOL_VDEV_SCRIPT_ENV_COUNT, sizeof (*env)); + if (!env) + return (NULL); + + env[0] = strdup("PATH=/bin:/sbin:/usr/bin:/usr/sbin"); + if (!env[0]) + goto error; + + /* Setup our custom environment variables */ + rc = asprintf(&env[1], "POOL_NAME=%s", pool_name ? pool_name : ""); + if (rc == -1) { + env[1] = NULL; + goto error; + } + + rc = asprintf(&env[2], "VDEV_PATH=%s", vdev_path ? vdev_path : ""); + if (rc == -1) { + env[2] = NULL; + goto error; + } + + rc = asprintf(&env[3], "VDEV_UPATH=%s", vdev_upath ? vdev_upath : ""); + if (rc == -1) { + env[3] = NULL; + goto error; + } + + rc = asprintf(&env[4], "VDEV_ENC_SYSFS_PATH=%s", + vdev_enc_sysfs_path ? vdev_enc_sysfs_path : ""); + if (rc == -1) { + env[4] = NULL; + goto error; + } + + if (opt_key != NULL) { + rc = asprintf(&env[5], "%s=%s", opt_key, + opt_val ? 
opt_val : ""); + if (rc == -1) { + env[5] = NULL; + goto error; + } + } + + return (env); + +error: + for (int i = 0; i < ZPOOL_VDEV_SCRIPT_ENV_COUNT; i++) + free(env[i]); + + free(env); + + return (NULL); +} + +/* + * Free the env[] array that was allocated by zpool_vdev_script_alloc_env(). + */ +void +zpool_vdev_script_free_env(char **env) +{ + for (int i = 0; i < ZPOOL_VDEV_SCRIPT_ENV_COUNT; i++) + free(env[i]); + + free(env); +} + +/* + * Prepare a disk by (optionally) running a program before labeling the disk. + * This can be useful for installing disk firmware or doing some pre-flight + * checks on the disk before it becomes part of the pool. The program run is + * located at ZFSEXECDIR/zfs_prepare_disk + * (E.x: /usr/local/libexec/zfs/zfs_prepare_disk). + * + * Return 0 on success, non-zero on failure. + */ +int +zpool_prepare_disk(zpool_handle_t *zhp, nvlist_t *vdev_nv, + const char *prepare_str, char **lines[], int *lines_cnt) +{ + const char *script_path = ZFSEXECDIR "/zfs_prepare_disk"; + const char *pool_name; + int rc = 0; + + /* Path to script and a NULL entry */ + char *argv[2] = {(char *)script_path}; + char **env = NULL; + const char *path = NULL, *enc_sysfs_path = NULL; + char *upath; + *lines_cnt = 0; + + if (access(script_path, X_OK) != 0) { + /* No script, nothing to do */ + return (0); + } + + (void) nvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_PATH, &path); + (void) nvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, + &enc_sysfs_path); + + upath = zfs_get_underlying_path(path); + pool_name = zhp ? zpool_get_name(zhp) : NULL; + + env = zpool_vdev_script_alloc_env(pool_name, path, upath, + enc_sysfs_path, "VDEV_PREPARE", prepare_str); + + free(upath); + + if (env == NULL) { + return (ENOMEM); + } + + rc = libzfs_run_process_get_stdout(script_path, argv, env, lines, + lines_cnt); + + zpool_vdev_script_free_env(env); + + return (rc); +} + +/* + * Optionally run a script and then label a disk. 
The script can be used to + * prepare a disk for inclusion into the pool. For example, it might update + * the disk's firmware or check its health. + * + * The 'name' provided is the short name, stripped of any leading + * /dev path, and is passed to zpool_label_disk. vdev_nv is the nvlist for + * the vdev. prepare_str is a string that gets passed as the VDEV_PREPARE + * env variable to the script. + * + * The following env vars are passed to the script: + * + * POOL_NAME: The pool name (blank during zpool create) + * VDEV_PREPARE: Reason why the disk is being prepared for inclusion: + * "create", "add", "replace", or "autoreplace" + * VDEV_PATH: Path to the disk + * VDEV_UPATH: One of the 'underlying paths' to the disk. This is + * useful for DM devices. + * VDEV_ENC_SYSFS_PATH: Path to the disk's enclosure sysfs path, if available. + * + * Note, some of these values can be blank. + * + * Return 0 on success, non-zero otherwise. + */ +int +zpool_prepare_and_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, + const char *name, nvlist_t *vdev_nv, const char *prepare_str, + char **lines[], int *lines_cnt) +{ + int rc; + char vdev_path[MAXPATHLEN]; + (void) snprintf(vdev_path, sizeof (vdev_path), "%s/%s", DISK_ROOT, + name); + + /* zhp will be NULL when creating a pool */ + rc = zpool_prepare_disk(zhp, vdev_nv, prepare_str, lines, lines_cnt); + if (rc != 0) + return (rc); + + rc = zpool_label_disk(hdl, zhp, name); + return (rc); +} diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index a9b9bf4c2c..ffad7fc02b 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -205,6 +205,15 @@ mutex_enter(kmutex_t *mp) mp->m_owner = pthread_self(); } +int +mutex_enter_check_return(kmutex_t *mp) +{ + int error = pthread_mutex_lock(&mp->m_lock); + if (error == 0) + mp->m_owner = pthread_self(); + return (error); +} + int mutex_tryenter(kmutex_t *mp) { diff --git a/lib/libzutil/os/linux/zutil_import_os.c b/lib/libzutil/os/linux/zutil_import_os.c index 
8b64369dc2..44ed697dd4 100644 --- a/lib/libzutil/os/linux/zutil_import_os.c +++ b/lib/libzutil/os/linux/zutil_import_os.c @@ -582,9 +582,8 @@ zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen) * Wait up to timeout_ms for udev to set up the device node. The device is * considered ready when libudev determines it has been initialized, all of * the device links have been verified to exist, and it has been allowed to - * settle. At this point the device the device can be accessed reliably. - * Depending on the complexity of the udev rules this process could take - * several seconds. + * settle. At this point the device can be accessed reliably. Depending on + * the complexity of the udev rules this process could take several seconds. */ int zpool_label_disk_wait(const char *path, int timeout_ms) diff --git a/man/Makefile.am b/man/Makefile.am index 36c1aede10..45156571ee 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -62,6 +62,7 @@ dist_man_MANS = \ %D%/man8/zfs-userspace.8 \ %D%/man8/zfs-wait.8 \ %D%/man8/zfs_ids_to_path.8 \ + %D%/man8/zfs_prepare_disk.8 \ %D%/man8/zgenhostid.8 \ %D%/man8/zinject.8 \ %D%/man8/zpool.8 \ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 3843419731..4ec52a2fb6 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -402,6 +402,12 @@ Practical upper limit of total metaslabs per top-level vdev. .It Sy metaslab_preload_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable metaslab group preloading. . +.It Sy metaslab_preload_limit Ns = Ns Sy 10 Pq uint +Maximum number of metaslabs per group to preload +. +.It Sy metaslab_preload_pct Ns = Ns Sy 50 Pq uint +Percentage of CPUs to run a metaslab preload taskq +. .It Sy metaslab_lba_weighting_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Give more weight to metaslabs with lower LBAs, assuming they have greater bandwidth, @@ -1131,6 +1137,11 @@ Selecting any option other than results in vector instructions from the respective CPU instruction set being used. . 
+.It Sy zfs_bclone_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int +Enable the experimental block cloning feature. +If this setting is 0, then even if feature@block_cloning is enabled, +attempts to clone blocks will act as though the feature is disabled. +. .It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string Select a BLAKE3 implementation. .Pp @@ -2144,6 +2155,11 @@ On very fragmented pools, lowering this .Pq typically to Sy 36 KiB can improve performance. . +.It Sy zil_maxcopied Ns = Ns Sy 7680 Ns B Po 7.5 KiB Pc Pq uint +This sets the maximum number of write bytes logged via WR_COPIED. +It tunes a tradeoff between additional memory copy and possibly worse log +space efficiency vs additional range lock/unlock. +. .It Sy zil_min_commit_timeout Ns = Ns Sy 5000 Pq u64 This sets the minimum delay in nanoseconds ZIL care to delay block commit, waiting for more records. @@ -2161,7 +2177,7 @@ if a volatile out-of-order write cache is enabled. Disable intent logging replay. Can be disabled for recovery from corrupted ZIL. . -.It Sy zil_slog_bulk Ns = Ns Sy 786432 Ns B Po 768 KiB Pc Pq u64 +.It Sy zil_slog_bulk Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq u64 Limit SLOG write size per commit executed with synchronous priority. Any writes above that will be executed with lower (asynchronous) priority to limit potential SLOG device abuse by single active ZIL writer. diff --git a/man/man7/zfsconcepts.7 b/man/man7/zfsconcepts.7 index 18a9e9b5ca..1be3d961c3 100644 --- a/man/man7/zfsconcepts.7 +++ b/man/man7/zfsconcepts.7 @@ -28,8 +28,9 @@ .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. +.\" Copyright 2023 Klara, Inc. .\" -.Dd June 30, 2019 +.Dd October 6, 2023 .Dt ZFSCONCEPTS 7 .Os . @@ -205,3 +206,40 @@ practices, such as regular backups. Consider using the .Sy compression property as a less resource-intensive alternative. 
+.Ss Block cloning +Block cloning is a facility that allows a file (or parts of a file) to be +.Qq cloned , +that is, a shallow copy made where the existing data blocks are referenced +rather than copied. +Later modifications to the data will cause a copy of the data block to be taken +and that copy modified. +This facility is used to implement +.Qq reflinks +or +.Qq file-level copy-on-write . +.Pp +Cloned blocks are tracked in a special on-disk structure called the Block +Reference Table +.Po BRT +.Pc . +Unlike deduplication, this table has minimal overhead, so can be enabled at all +times. +.Pp +Also unlike deduplication, cloning must be requested by a user program. +Many common file copying programs, including newer versions of +.Nm /bin/cp , +will try to create clones automatically. +Look for +.Qq clone , +.Qq dedupe +or +.Qq reflink +in the documentation for more information. +.Pp +There are some limitations to block cloning. +Only whole blocks can be cloned, and blocks can not be cloned if they are not +yet written to disk, or if they are encrypted, or the source and destination +.Sy recordsize +properties differ. +The OS may add additional restrictions; +for example, most versions of Linux will not allow clones across datasets. diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index b901ce6c29..8ca4bd927b 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -219,8 +219,11 @@ to the end of the line is ignored. 
.Bd -literal -compact -offset 4n .No example# Nm cat Pa /usr/share/zfs/compatibility.d/grub2 # Features which are supported by GRUB2 +allocation_classes async_destroy +block_cloning bookmarks +device_rebuild embedded_data empty_bpobj enabled_txg @@ -229,8 +232,14 @@ filesystem_limits hole_birth large_blocks livelist +log_spacemap lz4_compress +project_quota +resilver_defer spacemap_histogram +spacemap_v2 +userobj_accounting +zilsaxattr zpool_checkpoint .No example# Nm zpool Cm create Fl o Sy compatibility Ns = Ns Ar grub2 Ar bootpool Ar vdev diff --git a/man/man8/.gitignore b/man/man8/.gitignore index f2fc702147..a468f9cbf9 100644 --- a/man/man8/.gitignore +++ b/man/man8/.gitignore @@ -1,2 +1,3 @@ /zed.8 /zfs-mount-generator.8 +/zfs_prepare_disk.8 diff --git a/man/man8/zfs_prepare_disk.8.in b/man/man8/zfs_prepare_disk.8.in new file mode 100644 index 0000000000..2a741531e4 --- /dev/null +++ b/man/man8/zfs_prepare_disk.8.in @@ -0,0 +1,70 @@ +.\" +.\" Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). +.\" Copyright (C) 2023 Lawrence Livermore National Security, LLC. +.\" Refer to the OpenZFS git commit log for authoritative copyright attribution. +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License Version 1.0 (CDDL-1.0). +.\" You can obtain a copy of the license from the top-level file +.\" "OPENSOLARIS.LICENSE" or at . +.\" You may not use this file except in compliance with the license. +.\" +.\" Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049) +.\" +.Dd August 30, 2023 +.Dt ZFS_PREPARE_DISK 8 +.Os +. +.Sh NAME +.Nm zfs_prepare_disk +.Nd special script that gets run before bringing a disk into a pool +.Sh DESCRIPTION +.Nm +is an optional script that gets called by libzfs before bringing a disk into a +pool. +It can be modified by the user to run whatever commands are necessary to prepare +a disk for inclusion into the pool. 
+For example, users can add lines to +.Nm zfs_prepare_disk +to do things like update the drive's firmware or check the drive's health. +.Nm zfs_prepare_disk +is optional and can be removed if not needed. +libzfs will look for the script at @zfsexecdir@/zfs_prepare_disk. +. +.Ss Properties +.Nm zfs_prepare_disk +will be passed the following environment variables: +.sp +.Bl -tag -compact -width "VDEV_ENC_SYSFS_PATH" +. +.It Nm POOL_NAME +.No Name of the pool +.It Nm VDEV_PATH +.No Path to the disk (like /dev/sda) +.It Nm VDEV_PREPARE +.No Reason why the disk is being prepared for inclusion +('create', 'add', 'replace', or 'autoreplace'). +This can be useful if you only want the script to be run under certain actions. +.It Nm VDEV_UPATH +.No Path to one of the underlying devices for the +disk. +For multipath this would return one of the /dev/sd* paths to the disk. +If the device is not a device mapper device, then +.Nm VDEV_UPATH +just returns the same value as +.Nm VDEV_PATH +.It Nm VDEV_ENC_SYSFS_PATH +.No Path to the disk's enclosure sysfs path, if available +.El +.Pp +Note that some of these variables may have a blank value. +.Nm POOL_NAME +is blank at pool creation time, for example. +.Sh ENVIRONMENT +.Nm zfs_prepare_disk +runs with a limited $PATH. +.Sh EXIT STATUS +.Nm zfs_prepare_disk +should return 0 on success, non-zero otherwise. +If non-zero is returned, the disk will not be included in the pool. +. diff --git a/module/Kbuild.in b/module/Kbuild.in index 720663f4c0..a9003fa8a2 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -489,6 +489,10 @@ zfs-$(CONFIG_ARM64) += $(addprefix zfs/,$(ZFS_OBJS_ARM64)) zfs-$(CONFIG_PPC) += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64)) zfs-$(CONFIG_PPC64) += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64)) +UBSAN_SANITIZE_zap_leaf.o := n +UBSAN_SANITIZE_zap_micro.o := n +UBSAN_SANITIZE_sa.o := n + # Suppress incorrect warnings from versions of objtool which are not # aware of x86 EVEX prefix instructions used for AVX512. 
OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512bw.o := y diff --git a/module/icp/asm-aarch64/sha2/sha256-armv8.S b/module/icp/asm-aarch64/sha2/sha256-armv8.S index fa50c4e74d..7ae486e4e2 100644 --- a/module/icp/asm-aarch64/sha2/sha256-armv8.S +++ b/module/icp/asm-aarch64/sha2/sha256-armv8.S @@ -49,6 +49,7 @@ .type zfs_sha256_block_armv7,%function .align 6 zfs_sha256_block_armv7: + hint #34 // bti c stp x29,x30,[sp,#-128]! add x29,sp,#0 @@ -1015,6 +1016,7 @@ zfs_sha256_block_armv7: .type zfs_sha256_block_armv8,%function .align 6 zfs_sha256_block_armv8: + hint #34 // bti c .Lv8_entry: stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -1155,6 +1157,7 @@ zfs_sha256_block_armv8: .type zfs_sha256_block_neon,%function .align 4 zfs_sha256_block_neon: + hint #34 // bti c .Lneon_entry: stp x29, x30, [sp, #-16]! mov x29, sp diff --git a/module/icp/asm-aarch64/sha2/sha512-armv8.S b/module/icp/asm-aarch64/sha2/sha512-armv8.S index 1683fc1ca5..9c61eeee4d 100644 --- a/module/icp/asm-aarch64/sha2/sha512-armv8.S +++ b/module/icp/asm-aarch64/sha2/sha512-armv8.S @@ -73,6 +73,7 @@ .type zfs_sha512_block_armv7,%function .align 6 zfs_sha512_block_armv7: + hint #34 // bti c stp x29,x30,[sp,#-128]! add x29,sp,#0 @@ -1040,6 +1041,7 @@ zfs_sha512_block_armv7: .type zfs_sha512_block_armv8,%function .align 6 zfs_sha512_block_armv8: + hint #34 // bti c .Lv8_entry: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later stp x29,x30,[sp,#-16]! 
diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c index ba22c77b69..842b80ade1 100644 --- a/module/os/freebsd/spl/spl_taskq.c +++ b/module/os/freebsd/spl/spl_taskq.c @@ -30,8 +30,6 @@ __FBSDID("$FreeBSD$"); #include -#include -#include #include #include #include @@ -66,11 +64,9 @@ taskq_t *dynamic_taskq = NULL; proc_t *system_proc; -extern int uma_align_cache; - static MALLOC_DEFINE(M_TASKQ, "taskq", "taskq structures"); -static CK_LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl; +static LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl; static unsigned long tqenthash; static unsigned long tqenthashlock; static struct sx *tqenthashtbl_lock; @@ -80,8 +76,8 @@ static taskqid_t tqidnext; #define TQIDHASH(tqid) (&tqenthashtbl[(tqid) & tqenthash]) #define TQIDHASHLOCK(tqid) (&tqenthashtbl_lock[((tqid) & tqenthashlock)]) +#define NORMAL_TASK 0 #define TIMEOUT_TASK 1 -#define NORMAL_TASK 2 static void system_taskq_init(void *arg) @@ -121,7 +117,7 @@ system_taskq_fini(void *arg) for (i = 0; i < tqenthashlock + 1; i++) sx_destroy(&tqenthashtbl_lock[i]); for (i = 0; i < tqenthash + 1; i++) - VERIFY(CK_LIST_EMPTY(&tqenthashtbl[i])); + VERIFY(LIST_EMPTY(&tqenthashtbl[i])); free(tqenthashtbl_lock, M_TASKQ); free(tqenthashtbl, M_TASKQ); } @@ -162,27 +158,27 @@ taskq_lookup(taskqid_t tqid) { taskq_ent_t *ent = NULL; - sx_xlock(TQIDHASHLOCK(tqid)); - CK_LIST_FOREACH(ent, TQIDHASH(tqid), tqent_hash) { + if (tqid == 0) + return (NULL); + sx_slock(TQIDHASHLOCK(tqid)); + LIST_FOREACH(ent, TQIDHASH(tqid), tqent_hash) { if (ent->tqent_id == tqid) break; } if (ent != NULL) refcount_acquire(&ent->tqent_rc); - sx_xunlock(TQIDHASHLOCK(tqid)); + sx_sunlock(TQIDHASHLOCK(tqid)); return (ent); } static taskqid_t taskq_insert(taskq_ent_t *ent) { - taskqid_t tqid; + taskqid_t tqid = __taskq_genid(); - tqid = __taskq_genid(); ent->tqent_id = tqid; - ent->tqent_registered = B_TRUE; sx_xlock(TQIDHASHLOCK(tqid)); - CK_LIST_INSERT_HEAD(TQIDHASH(tqid), ent, 
tqent_hash); + LIST_INSERT_HEAD(TQIDHASH(tqid), ent, tqent_hash); sx_xunlock(TQIDHASHLOCK(tqid)); return (tqid); } @@ -192,13 +188,14 @@ taskq_remove(taskq_ent_t *ent) { taskqid_t tqid = ent->tqent_id; - if (!ent->tqent_registered) + if (tqid == 0) return; - sx_xlock(TQIDHASHLOCK(tqid)); - CK_LIST_REMOVE(ent, tqent_hash); + if (ent->tqent_id != 0) { + LIST_REMOVE(ent, tqent_hash); + ent->tqent_id = 0; + } sx_xunlock(TQIDHASHLOCK(tqid)); - ent->tqent_registered = B_FALSE; } static void @@ -285,21 +282,22 @@ taskq_cancel_id(taskq_t *tq, taskqid_t tid) int rc; taskq_ent_t *ent; - if (tid == 0) - return (0); - if ((ent = taskq_lookup(tid)) == NULL) return (0); - ent->tqent_cancelled = B_TRUE; - if (ent->tqent_type == TIMEOUT_TASK) { + if (ent->tqent_type == NORMAL_TASK) { + rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend); + if (rc == EBUSY) + taskqueue_drain(tq->tq_queue, &ent->tqent_task); + } else { rc = taskqueue_cancel_timeout(tq->tq_queue, &ent->tqent_timeout_task, &pend); - } else - rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend); - if (rc == EBUSY) { - taskqueue_drain(tq->tq_queue, &ent->tqent_task); - } else if (pend) { + if (rc == EBUSY) { + taskqueue_drain_timeout(tq->tq_queue, + &ent->tqent_timeout_task); + } + } + if (pend) { /* * Tasks normally free themselves when run, but here the task * was cancelled so it did not free itself. 
@@ -312,12 +310,13 @@ taskq_cancel_id(taskq_t *tq, taskqid_t tid) } static void -taskq_run(void *arg, int pending __unused) +taskq_run(void *arg, int pending) { taskq_ent_t *task = arg; - if (!task->tqent_cancelled) - task->tqent_func(task->tqent_arg); + if (pending == 0) + return; + task->tqent_func(task->tqent_arg); taskq_free(task); } @@ -345,7 +344,6 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, task->tqent_func = func; task->tqent_arg = arg; task->tqent_type = TIMEOUT_TASK; - task->tqent_cancelled = B_FALSE; refcount_init(&task->tqent_rc, 1); tqid = taskq_insert(task); TIMEOUT_TASK_INIT(tq->tq_queue, &task->tqent_timeout_task, 0, @@ -379,7 +377,6 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) refcount_init(&task->tqent_rc, 1); task->tqent_func = func; task->tqent_arg = arg; - task->tqent_cancelled = B_FALSE; task->tqent_type = NORMAL_TASK; tqid = taskq_insert(task); TASK_INIT(&task->tqent_task, prio, taskq_run, task); @@ -388,10 +385,12 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) } static void -taskq_run_ent(void *arg, int pending __unused) +taskq_run_ent(void *arg, int pending) { taskq_ent_t *task = arg; + if (pending == 0) + return; task->tqent_func(task->tqent_arg); } @@ -406,8 +405,6 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags, * can go at the front of the queue. 
*/ prio = !!(flags & TQ_FRONT); - task->tqent_cancelled = B_FALSE; - task->tqent_registered = B_FALSE; task->tqent_id = 0; task->tqent_func = func; task->tqent_arg = arg; @@ -427,12 +424,13 @@ taskq_wait_id(taskq_t *tq, taskqid_t tid) { taskq_ent_t *ent; - if (tid == 0) - return; if ((ent = taskq_lookup(tid)) == NULL) return; - taskqueue_drain(tq->tq_queue, &ent->tqent_task); + if (ent->tqent_type == NORMAL_TASK) + taskqueue_drain(tq->tq_queue, &ent->tqent_task); + else + taskqueue_drain_timeout(tq->tq_queue, &ent->tqent_timeout_task); taskq_free(ent); } diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c index 12f16edb1e..92696c0bf1 100644 --- a/module/os/freebsd/zfs/arc_os.c +++ b/module/os/freebsd/zfs/arc_os.c @@ -52,11 +52,6 @@ #include #include -#if __FreeBSD_version >= 1300139 -static struct sx arc_vnlru_lock; -static struct vnode *arc_vnlru_marker; -#endif - extern struct vfsops zfs_vfsops; uint_t zfs_arc_free_target = 0; @@ -131,53 +126,6 @@ arc_default_max(uint64_t min, uint64_t allmem) return (MAX(allmem * 5 / 8, size)); } -/* - * Helper function for arc_prune_async() it is responsible for safely - * handling the execution of a registered arc_prune_func_t. - */ -static void -arc_prune_task(void *arg) -{ - uint64_t nr_scan = (uintptr_t)arg; - -#ifndef __ILP32__ - if (nr_scan > INT_MAX) - nr_scan = INT_MAX; -#endif - -#if __FreeBSD_version >= 1300139 - sx_xlock(&arc_vnlru_lock); - vnlru_free_vfsops(nr_scan, &zfs_vfsops, arc_vnlru_marker); - sx_xunlock(&arc_vnlru_lock); -#else - vnlru_free(nr_scan, &zfs_vfsops); -#endif -} - -/* - * Notify registered consumers they must drop holds on a portion of the ARC - * buffered they reference. This provides a mechanism to ensure the ARC can - * honor the metadata limit and reclaim otherwise pinned ARC buffers. This - * is analogous to dnlc_reduce_cache() but more generic. 
- * - * This operation is performed asynchronously so it may be safely called - * in the context of the arc_reclaim_thread(). A reference is taken here - * for each registered arc_prune_t and the arc_prune_task() is responsible - * for releasing it once the registered arc_prune_func_t has completed. - */ -void -arc_prune_async(uint64_t adjust) -{ - -#ifndef __LP64__ - if (adjust > UINTPTR_MAX) - adjust = UINTPTR_MAX; -#endif - taskq_dispatch(arc_prune_taskq, arc_prune_task, - (void *)(intptr_t)adjust, TQ_SLEEP); - ARCSTAT_BUMP(arcstat_prune); -} - uint64_t arc_all_memory(void) { @@ -228,10 +176,6 @@ arc_lowmem_init(void) { arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, EVENTHANDLER_PRI_FIRST); -#if __FreeBSD_version >= 1300139 - arc_vnlru_marker = vnlru_alloc_marker(); - sx_init(&arc_vnlru_lock, "arc vnlru lock"); -#endif } void @@ -239,12 +183,6 @@ arc_lowmem_fini(void) { if (arc_event_lowmem != NULL) EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); -#if __FreeBSD_version >= 1300139 - if (arc_vnlru_marker != NULL) { - vnlru_free_marker(arc_vnlru_marker); - sx_destroy(&arc_vnlru_lock); - } -#endif } void diff --git a/module/os/freebsd/zfs/event_os.c b/module/os/freebsd/zfs/event_os.c index 97ac151e4f..239d44d0cf 100644 --- a/module/os/freebsd/zfs/event_os.c +++ b/module/os/freebsd/zfs/event_os.c @@ -1,5 +1,5 @@ /* - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Rob Wing * diff --git a/module/os/freebsd/zfs/kmod_core.c b/module/os/freebsd/zfs/kmod_core.c index f4c87013db..9a26857352 100644 --- a/module/os/freebsd/zfs/kmod_core.c +++ b/module/os/freebsd/zfs/kmod_core.c @@ -141,7 +141,7 @@ zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag, if (len != sizeof (zfs_iocparm_t)) return (EINVAL); - uaddr = (void *)zp->zfs_cmd; + uaddr = (void *)(uintptr_t)zp->zfs_cmd; zc = vmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); #ifdef ZFS_LEGACY_SUPPORT /* diff --git 
a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index 8ae2f23c3e..38ef590702 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -596,28 +596,6 @@ SYSCTL_UINT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, " space map to continue allocations in a first-fit fashion"); /* END CSTYLED */ -/* - * Percentage of all cpus that can be used by the metaslab taskq. - */ -extern int metaslab_load_pct; - -/* BEGIN CSTYLED */ -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, - CTLFLAG_RWTUN, &metaslab_load_pct, 0, - "Percentage of cpus that can be used by the metaslab taskq"); -/* END CSTYLED */ - -/* - * Max number of metaslabs per group to preload. - */ -extern uint_t metaslab_preload_limit; - -/* BEGIN CSTYLED */ -SYSCTL_UINT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, - CTLFLAG_RWTUN, &metaslab_preload_limit, 0, - "Max number of metaslabs per group to preload"); -/* END CSTYLED */ - /* mmp.c */ int diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index e8b9ada131..23b8da1845 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -89,6 +89,10 @@ int zfs_debug_level; SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, "Debug level"); +int zfs_bclone_enabled = 0; +SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_enabled, CTLFLAG_RWTUN, + &zfs_bclone_enabled, 0, "Enable block cloning"); + struct zfs_jailparam { int mount_snapshot; }; @@ -2070,6 +2074,26 @@ zfs_vnodes_adjust_back(void) #endif } +#if __FreeBSD_version >= 1300139 +static struct sx zfs_vnlru_lock; +static struct vnode *zfs_vnlru_marker; +#endif +static arc_prune_t *zfs_prune; + +static void +zfs_prune_task(uint64_t nr_to_scan, void *arg __unused) +{ + if (nr_to_scan > INT_MAX) + nr_to_scan = INT_MAX; +#if __FreeBSD_version >= 1300139 + sx_xlock(&zfs_vnlru_lock); + vnlru_free_vfsops(nr_to_scan, &zfs_vfsops, zfs_vnlru_marker); + sx_xunlock(&zfs_vnlru_lock); +#else + 
vnlru_free(nr_to_scan, &zfs_vfsops); +#endif +} + void zfs_init(void) { @@ -2096,11 +2120,23 @@ zfs_init(void) dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0); + +#if __FreeBSD_version >= 1300139 + zfs_vnlru_marker = vnlru_alloc_marker(); + sx_init(&zfs_vnlru_lock, "zfs vnlru lock"); +#endif + zfs_prune = arc_add_prune_callback(zfs_prune_task, NULL); } void zfs_fini(void) { + arc_remove_prune_callback(zfs_prune); +#if __FreeBSD_version >= 1300139 + vnlru_free_marker(zfs_vnlru_marker); + sx_destroy(&zfs_vnlru_lock); +#endif + taskq_destroy(zfsvfs_taskq); zfsctl_fini(); zfs_znode_fini(); diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index c498a13282..f672deed34 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -6243,6 +6243,11 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) int error; uint64_t len = *ap->a_lenp; + if (!zfs_bclone_enabled) { + mp = NULL; + goto bad_write_fallback; + } + /* * TODO: If offset/length is not aligned to recordsize, use * vn_generic_copy_file_range() on this fragment. diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c index 29a8802b83..43ed087e2d 100644 --- a/module/os/linux/zfs/arc_os.c +++ b/module/os/linux/zfs/arc_os.c @@ -489,56 +489,5 @@ arc_unregister_hotplug(void) } #endif /* _KERNEL */ -/* - * Helper function for arc_prune_async() it is responsible for safely - * handling the execution of a registered arc_prune_func_t. - */ -static void -arc_prune_task(void *ptr) -{ - arc_prune_t *ap = (arc_prune_t *)ptr; - arc_prune_func_t *func = ap->p_pfunc; - - if (func != NULL) - func(ap->p_adjust, ap->p_private); - - zfs_refcount_remove(&ap->p_refcnt, func); -} - -/* - * Notify registered consumers they must drop holds on a portion of the ARC - * buffered they reference. 
This provides a mechanism to ensure the ARC can - * honor the metadata limit and reclaim otherwise pinned ARC buffers. This - * is analogous to dnlc_reduce_cache() but more generic. - * - * This operation is performed asynchronously so it may be safely called - * in the context of the arc_reclaim_thread(). A reference is taken here - * for each registered arc_prune_t and the arc_prune_task() is responsible - * for releasing it once the registered arc_prune_func_t has completed. - */ -void -arc_prune_async(uint64_t adjust) -{ - arc_prune_t *ap; - - mutex_enter(&arc_prune_mtx); - for (ap = list_head(&arc_prune_list); ap != NULL; - ap = list_next(&arc_prune_list, ap)) { - - if (zfs_refcount_count(&ap->p_refcnt) >= 2) - continue; - - zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc); - ap->p_adjust = adjust; - if (taskq_dispatch(arc_prune_taskq, arc_prune_task, - ap, TQ_SLEEP) == TASKQID_INVALID) { - zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc); - continue; - } - ARCSTAT_BUMP(arcstat_prune); - } - mutex_exit(&arc_prune_mtx); -} - ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW, "Limit on number of pages that ARC shrinker can reclaim at once"); diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index 02cb379ea8..94e25fa0ae 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -522,7 +522,7 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, ip->i_blkbits = SPA_MINBLOCKSHIFT; ip->i_atime = now; ip->i_mtime = now; - ip->i_ctime = now; + zpl_inode_set_ctime_to_ts(ip, now); ip->i_fop = fops; ip->i_op = ops; #if defined(IOP_XATTR) diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c index 3efd4ab159..c2ed67c438 100644 --- a/module/os/linux/zfs/zfs_uio.c +++ b/module/os/linux/zfs/zfs_uio.c @@ -204,22 +204,6 @@ zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) this_seg_start = orig_loffset; rq_for_each_segment(bv, rq, iter) { - if (uio->iter.bio) { - /* - 
* If uio->iter.bio is present, then we know we've saved - * uio->iter from a previous call to this function, and - * we can skip ahead in this rq_for_each_segment() loop - * to where we last left off. That way, we don't need - * to iterate over tons of segments we've already - * processed - we can just restore the "saved state". - */ - iter = uio->iter; - bv = uio->bv; - this_seg_start = uio->uio_loffset; - memset(&uio->iter, 0, sizeof (uio->iter)); - continue; - } - /* * Lookup what the logical offset of the last byte of this * segment is. @@ -260,19 +244,6 @@ zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) copied = 1; /* We copied some data */ } - if (n == 0) { - /* - * All done copying. Save our 'iter' value to the uio. - * This allows us to "save our state" and skip ahead in - * the rq_for_each_segment() loop the next time we call - * call zfs_uiomove_bvec_rq() on this uio (which we - * will be doing for any remaining data in the uio). - */ - uio->iter = iter; /* make a copy of the struct data */ - uio->bv = bv; - return (0); - } - this_seg_start = this_seg_end + 1; } diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 8f75291f7c..7bb70439a1 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -1506,7 +1506,7 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) * read-only flag, pretend it was set, as done for snapshots. 
*/ if (!canwrite) - vfs->vfs_readonly = true; + vfs->vfs_readonly = B_TRUE; error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs); if (error) { diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index e8594a994c..9cdce3afdc 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -1684,7 +1684,12 @@ out: * RETURN: 0 (always succeeds) */ int +#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK +zfs_getattr_fast(zidmap_t *user_ns, u32 request_mask, struct inode *ip, + struct kstat *sp) +#else zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp) +#endif { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); @@ -1697,7 +1702,11 @@ zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp) mutex_enter(&zp->z_lock); +#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK + zpl_generic_fillattr(user_ns, request_mask, ip, sp); +#else zpl_generic_fillattr(user_ns, ip, sp); +#endif /* * +1 link count for root inode with visible '.zfs' directory. 
*/ @@ -2471,8 +2480,8 @@ top: if (mask & (ATTR_CTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_ctime, ctime); - ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime, - ZTOI(zp)); + zpl_inode_set_ctime_to_ts(ZTOI(zp), + zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp))); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); } @@ -3677,6 +3686,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, caddr_t va; int err = 0; uint64_t mtime[2], ctime[2]; + inode_timespec_t tmp_ctime; sa_bulk_attr_t bulk[3]; int cnt = 0; struct address_space *mapping; @@ -3841,7 +3851,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, /* Preserve the mtime and ctime provided by the inode */ ZFS_TIME_ENCODE(&ip->i_mtime, mtime); - ZFS_TIME_ENCODE(&ip->i_ctime, ctime); + tmp_ctime = zpl_inode_get_ctime(ip); + ZFS_TIME_ENCODE(&tmp_ctime, ctime); zp->z_atime_dirty = B_FALSE; zp->z_seq++; @@ -3891,6 +3902,7 @@ zfs_dirty_inode(struct inode *ip, int flags) zfsvfs_t *zfsvfs = ITOZSB(ip); dmu_tx_t *tx; uint64_t mode, atime[2], mtime[2], ctime[2]; + inode_timespec_t tmp_ctime; sa_bulk_attr_t bulk[4]; int error = 0; int cnt = 0; @@ -3937,7 +3949,8 @@ zfs_dirty_inode(struct inode *ip, int flags) /* Preserve the mode, mtime and ctime provided by the inode */ ZFS_TIME_ENCODE(&ip->i_atime, atime); ZFS_TIME_ENCODE(&ip->i_mtime, mtime); - ZFS_TIME_ENCODE(&ip->i_ctime, ctime); + tmp_ctime = zpl_inode_get_ctime(ip); + ZFS_TIME_ENCODE(&tmp_ctime, ctime); mode = ip->i_mode; zp->z_mode = mode; @@ -4087,8 +4100,8 @@ zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); - if ((vm_flags & VM_WRITE) && (zp->z_pflags & - (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { + if ((vm_flags & VM_WRITE) && (vm_flags & VM_SHARED) && + (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { zfs_exit(zfsvfs, FTAG); return 
(SET_ERROR(EPERM)); } @@ -4258,4 +4271,8 @@ EXPORT_SYMBOL(zfs_map); module_param(zfs_delete_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); +/* CSTYLED */ +module_param(zfs_bclone_enabled, uint, 0644); +MODULE_PARM_DESC(zfs_bclone_enabled, "Enable block cloning"); + #endif diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c index 52c8e51df6..f71026da83 100644 --- a/module/os/linux/zfs/zfs_znode.c +++ b/module/os/linux/zfs/zfs_znode.c @@ -542,6 +542,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, uint64_t links; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; + inode_timespec_t tmp_ctime; uint64_t projid = ZFS_DEFAULT_PROJID; sa_bulk_attr_t bulk[12]; int count = 0; @@ -615,7 +616,8 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, ZFS_TIME_DECODE(&ip->i_atime, atime); ZFS_TIME_DECODE(&ip->i_mtime, mtime); - ZFS_TIME_DECODE(&ip->i_ctime, ctime); + ZFS_TIME_DECODE(&tmp_ctime, ctime); + zpl_inode_set_ctime_to_ts(ip, tmp_ctime); ZFS_TIME_DECODE(&zp->z_btime, btime); ip->i_ino = zp->z_id; @@ -1195,6 +1197,7 @@ zfs_rezget(znode_t *zp) uint64_t gen; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; + inode_timespec_t tmp_ctime; uint64_t projid = ZFS_DEFAULT_PROJID; znode_hold_t *zh; @@ -1289,7 +1292,8 @@ zfs_rezget(znode_t *zp) ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime); ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime); - ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime); + ZFS_TIME_DECODE(&tmp_ctime, ctime); + zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime); ZFS_TIME_DECODE(&zp->z_btime, btime); if ((uint32_t)gen != ZTOI(zp)->i_generation) { @@ -1397,7 +1401,7 @@ zfs_zinactive(znode_t *zp) boolean_t zfs_relatime_need_update(const struct inode *ip) { - inode_timespec_t now; + inode_timespec_t now, tmp_ctime; gethrestime(&now); /* @@ -1408,7 +1412,8 @@ zfs_relatime_need_update(const struct inode *ip) if 
(zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0) return (B_TRUE); - if (zfs_compare_timespec(&ip->i_ctime, &ip->i_atime) >= 0) + tmp_ctime = zpl_inode_get_ctime(ip); + if (zfs_compare_timespec(&tmp_ctime, &ip->i_atime) >= 0) return (B_TRUE); if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60) @@ -1434,7 +1439,7 @@ void zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], uint64_t ctime[2]) { - inode_timespec_t now; + inode_timespec_t now, tmp_ctime; gethrestime(&now); @@ -1451,7 +1456,8 @@ zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], if (flag & ATTR_CTIME) { ZFS_TIME_ENCODE(&now, ctime); - ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime); + ZFS_TIME_DECODE(&tmp_ctime, ctime); + zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime); if (ZTOZSB(zp)->z_use_fuids) zp->z_pflags |= ZFS_ARCHIVE; } diff --git a/module/os/linux/zfs/zpl_ctldir.c b/module/os/linux/zfs/zpl_ctldir.c index 7786444fea..8ee7fcecc7 100644 --- a/module/os/linux/zfs/zpl_ctldir.c +++ b/module/os/linux/zfs/zpl_ctldir.c @@ -124,6 +124,8 @@ zpl_root_getattr_impl(const struct path *path, struct kstat *stat, generic_fillattr(user_ns, ip, stat); #elif defined(HAVE_GENERIC_FILLATTR_IDMAP) generic_fillattr(user_ns, ip, stat); +#elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK) + generic_fillattr(user_ns, request_mask, ip, stat); #else (void) user_ns; #endif @@ -435,6 +437,8 @@ zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat, generic_fillattr(user_ns, ip, stat); #elif defined(HAVE_GENERIC_FILLATTR_IDMAP) generic_fillattr(user_ns, ip, stat); +#elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK) + generic_fillattr(user_ns, request_mask, ip, stat); #else (void) user_ns; #endif @@ -609,6 +613,8 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, generic_fillattr(user_ns, path->dentry->d_inode, stat); #elif defined(HAVE_GENERIC_FILLATTR_IDMAP) generic_fillattr(user_ns, path->dentry->d_inode, stat); +#elif 
defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK) + generic_fillattr(user_ns, request_mask, ip, stat); #else (void) user_ns; #endif @@ -623,7 +629,10 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp); if (error == 0) { -#if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) +#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK + error = -zfs_getattr_fast(user_ns, request_mask, ZTOI(dzp), + stat); +#elif (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) error = -zfs_getattr_fast(user_ns, ZTOI(dzp), stat); #else error = -zfs_getattr_fast(kcred->user_ns, ZTOI(dzp), stat); diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index 2abbf44df5..139c51cf46 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -31,6 +31,8 @@ #include #include +int zfs_bclone_enabled = 0; + /* * Clone part of a file via block cloning. 
* @@ -50,6 +52,9 @@ __zpl_clone_file_range(struct file *src_file, loff_t src_off, fstrans_cookie_t cookie; int err; + if (!zfs_bclone_enabled) + return (-EOPNOTSUPP); + if (!spa_feature_is_enabled( dmu_objset_spa(ITOZSB(dst_i)->z_os), SPA_FEATURE_BLOCK_CLONING)) return (-EOPNOTSUPP); @@ -202,8 +207,10 @@ zpl_ioctl_ficlone(struct file *dst_file, void *arg) if (src_file == NULL) return (-EBADF); - if (dst_file->f_op != src_file->f_op) + if (dst_file->f_op != src_file->f_op) { + fput(src_file); return (-EXDEV); + } size_t len = i_size_read(file_inode(src_file)); @@ -237,8 +244,10 @@ zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg) if (src_file == NULL) return (-EBADF); - if (dst_file->f_op != src_file->f_op) + if (dst_file->f_op != src_file->f_op) { + fput(src_file); return (-EXDEV); + } size_t len = fcr.fcr_src_length; if (len == 0) diff --git a/module/os/linux/zfs/zpl_inode.c b/module/os/linux/zfs/zpl_inode.c index ae8bc75ddd..adf987b3c9 100644 --- a/module/os/linux/zfs/zpl_inode.c +++ b/module/os/linux/zfs/zpl_inode.c @@ -435,7 +435,9 @@ zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, * XXX query_flags currently ignored. 
*/ -#if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) +#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK + error = -zfs_getattr_fast(user_ns, request_mask, ip, stat); +#elif (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) error = -zfs_getattr_fast(user_ns, ip, stat); #else error = -zfs_getattr_fast(kcred->user_ns, ip, stat); @@ -774,7 +776,7 @@ zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) return (-EMLINK); crhold(cr); - ip->i_ctime = current_time(ip); + zpl_inode_set_ctime_to_ts(ip, current_time(ip)); /* Must have an existing ref, so igrab() cannot return NULL */ VERIFY3P(igrab(ip), !=, NULL); diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c index a375e6b98c..670058bd4e 100644 --- a/module/os/linux/zfs/zpl_super.c +++ b/module/os/linux/zfs/zpl_super.c @@ -378,7 +378,7 @@ zpl_kill_sb(struct super_block *sb) } void -zpl_prune_sb(int64_t nr_to_scan, void *arg) +zpl_prune_sb(uint64_t nr_to_scan, void *arg) { struct super_block *sb = (struct super_block *)arg; int objects = 0; diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c index 76895d9d91..4d6ac10495 100644 --- a/module/os/linux/zfs/zpl_xattr.c +++ b/module/os/linux/zfs/zpl_xattr.c @@ -544,7 +544,7 @@ zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, error = -zfs_write_simple(xzp, value, size, pos, NULL); out: if (error == 0) { - ip->i_ctime = current_time(ip); + zpl_inode_set_ctime_to_ts(ip, current_time(ip)); zfs_mark_inode_dirty(ip); } @@ -1042,7 +1042,8 @@ zpl_set_acl_impl(struct inode *ip, struct posix_acl *acl, int type) */ if (ip->i_mode != mode) { ip->i_mode = ITOZ(ip)->z_mode = mode; - ip->i_ctime = current_time(ip); + zpl_inode_set_ctime_to_ts(ip, + current_time(ip)); zfs_mark_inode_dirty(ip); } @@ -1201,7 +1202,7 @@ zpl_init_acl(struct inode *ip, struct inode *dir) return (PTR_ERR(acl)); if (!acl) { ITOZ(ip)->z_mode = (ip->i_mode &= 
~current_umask()); - ip->i_ctime = current_time(ip); + zpl_inode_set_ctime_to_ts(ip, current_time(ip)); zfs_mark_inode_dirty(ip); return (0); } diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 7a95b54bdf..f94ce69fb9 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -873,7 +873,13 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode, switch (cmd) { case BLKFLSBUF: +#ifdef HAVE_FSYNC_BDEV fsync_bdev(bdev); +#elif defined(HAVE_SYNC_BLOCKDEV) + sync_blockdev(bdev); +#else +#error "Neither fsync_bdev() nor sync_blockdev() found" +#endif invalidate_bdev(bdev); rw_enter(&zv->zv_suspend_lock, RW_READER); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index b6ec06407c..df84252bd2 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -748,8 +748,7 @@ taskq_t *arc_prune_taskq; * Other sizes */ -#define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) -#define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr)) +#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* @@ -887,6 +886,8 @@ static void l2arc_do_free_on_write(void); static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, boolean_t state_only); +static void arc_prune_async(uint64_t adjust); + #define l2arc_hdr_arcstats_increment(hdr) \ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE) #define l2arc_hdr_arcstats_decrement(hdr) \ @@ -1113,7 +1114,6 @@ buf_hash_remove(arc_buf_hdr_t *hdr) */ static kmem_cache_t *hdr_full_cache; -static kmem_cache_t *hdr_full_crypt_cache; static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; @@ -1134,7 +1134,6 @@ buf_fini(void) for (int i = 0; i < BUF_LOCKS; i++) mutex_destroy(BUF_HASH_LOCK(i)); kmem_cache_destroy(hdr_full_cache); - kmem_cache_destroy(hdr_full_crypt_cache); kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } @@ -1151,7 +1150,6 @@ hdr_full_cons(void *vbuf, void 
*unused, int kmflag) memset(hdr, 0, HDR_FULL_SIZE); hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); #ifdef ZFS_DEBUG mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); @@ -1163,19 +1161,6 @@ hdr_full_cons(void *vbuf, void *unused, int kmflag) return (0); } -static int -hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag) -{ - (void) unused; - arc_buf_hdr_t *hdr = vbuf; - - hdr_full_cons(vbuf, unused, kmflag); - memset(&hdr->b_crypt_hdr, 0, sizeof (hdr->b_crypt_hdr)); - arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); - - return (0); -} - static int hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { @@ -1211,7 +1196,6 @@ hdr_full_dest(void *vbuf, void *unused) arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); - cv_destroy(&hdr->b_l1hdr.b_cv); zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt); #ifdef ZFS_DEBUG mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); @@ -1220,16 +1204,6 @@ hdr_full_dest(void *vbuf, void *unused) arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); } -static void -hdr_full_crypt_dest(void *vbuf, void *unused) -{ - (void) vbuf, (void) unused; - - hdr_full_dest(vbuf, unused); - arc_space_return(sizeof (((arc_buf_hdr_t *)NULL)->b_crypt_hdr), - ARC_SPACE_HDRS); -} - static void hdr_l2only_dest(void *vbuf, void *unused) { @@ -1285,9 +1259,6 @@ retry: hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0); - hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt", - HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest, - NULL, NULL, NULL, 0); hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL, NULL, NULL, 0); @@ -1395,7 +1366,7 @@ arc_buf_is_shared(arc_buf_t *buf) abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); 
IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); - IMPLY(shared, ARC_BUF_SHARED(buf)); + EQUIV(shared, ARC_BUF_SHARED(buf)); IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); /* @@ -2006,7 +1977,6 @@ arc_buf_untransform_in_place(arc_buf_t *buf) arc_buf_size(buf)); buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; - hdr->b_crypt_hdr.b_ebufcnt -= 1; } /* @@ -2041,7 +2011,7 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, IMPLY(encrypted, HDR_ENCRYPTED(hdr)); IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf)); IMPLY(encrypted, ARC_BUF_COMPRESSED(buf)); - IMPLY(encrypted, !ARC_BUF_SHARED(buf)); + IMPLY(encrypted, !arc_buf_is_shared(buf)); /* * If the caller wanted encrypted data we just need to copy it from @@ -2109,7 +2079,9 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, } if (hdr_compressed == compressed) { - if (!arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { + ASSERT(arc_buf_is_shared(buf)); + } else { abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, arc_buf_size(buf)); } @@ -2121,7 +2093,7 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, * If the buf is sharing its data with the hdr, unlink it and * allocate a new data buffer for the buf. 
*/ - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); /* We need to give the buf its own b_data */ @@ -2133,6 +2105,8 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, /* Previously overhead was 0; just add new overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); } else if (ARC_BUF_COMPRESSED(buf)) { + ASSERT(!arc_buf_is_shared(buf)); + /* We need to reallocate the buf's b_data */ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), buf); @@ -2241,7 +2215,6 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { - ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); @@ -2261,7 +2234,7 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_buf_size(buf), buf); @@ -2281,7 +2254,6 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { - ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); @@ -2301,7 +2273,7 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_buf_size(buf), buf); @@ -2397,7 +2369,9 @@ arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) l2hdr = &hdr->b_l2hdr; if (l1hdr) { - abi->abi_bufcnt = l1hdr->b_bufcnt; + abi->abi_bufcnt = 0; + for (arc_buf_t *buf = l1hdr->b_buf; buf; buf = buf->b_next) + 
abi->abi_bufcnt++; abi->abi_access = l1hdr->b_arc_access; abi->abi_mru_hits = l1hdr->b_mru_hits; abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; @@ -2425,7 +2399,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) { arc_state_t *old_state; int64_t refcnt; - uint32_t bufcnt; boolean_t update_old, update_new; arc_buf_contents_t type = arc_buf_type(hdr); @@ -2439,19 +2412,16 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt); - bufcnt = hdr->b_l1hdr.b_bufcnt; - update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL || - HDR_HAS_RABD(hdr)); + update_old = (hdr->b_l1hdr.b_buf != NULL || + hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); - IMPLY(GHOST_STATE(old_state), bufcnt == 0); - IMPLY(GHOST_STATE(new_state), bufcnt == 0); IMPLY(GHOST_STATE(old_state), hdr->b_l1hdr.b_buf == NULL); IMPLY(GHOST_STATE(new_state), hdr->b_l1hdr.b_buf == NULL); - IMPLY(old_state == arc_anon, bufcnt <= 1); + IMPLY(old_state == arc_anon, hdr->b_l1hdr.b_buf == NULL || + ARC_BUF_LAST(hdr->b_l1hdr.b_buf)); } else { old_state = arc_l2c_only; refcnt = 0; - bufcnt = 0; update_old = B_FALSE; } update_new = update_old; @@ -2499,14 +2469,12 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) if (update_new && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { - ASSERT0(bufcnt); /* * When moving a header to a ghost state, we first - * remove all arc buffers. Thus, we'll have a - * bufcnt of zero, and no arc buffer to use for - * the reference. As a result, we use the arc - * header pointer for the reference. + * remove all arc buffers. Thus, we'll have no arc + * buffer to use for the reference. As a result, we + * use the arc header pointer for the reference. 
*/ (void) zfs_refcount_add_many( &new_state->arcs_size[type], @@ -2514,7 +2482,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); } else { - uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, @@ -2523,8 +2490,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - ASSERT3U(bufcnt, !=, 0); - buffers++; /* * When the arc_buf_t is sharing the data @@ -2533,14 +2498,13 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) * add to the refcount if the arc_buf_t is * not shared. */ - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_add_many( &new_state->arcs_size[type], arc_buf_size(buf), buf); } - ASSERT3U(bufcnt, ==, buffers); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many( @@ -2559,7 +2523,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { - ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); @@ -2575,7 +2538,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) &old_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); } else { - uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, @@ -2584,8 +2546,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - ASSERT3U(bufcnt, !=, 0); - buffers++; /* * When the arc_buf_t is sharing the data @@ -2594,14 +2554,13 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) * add to the refcount if the arc_buf_t is * not shared. 
*/ - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_remove_many( &old_state->arcs_size[type], arc_buf_size(buf), buf); } - ASSERT3U(bufcnt, ==, buffers); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); @@ -2849,9 +2808,6 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; - hdr->b_l1hdr.b_bufcnt += 1; - if (encrypted) - hdr->b_crypt_hdr.b_ebufcnt += 1; /* * If the user wants the data from the hdr, we need to either copy or @@ -3093,8 +3049,6 @@ arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) } buf->b_next = NULL; ASSERT3P(lastbuf, !=, buf); - IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); - IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); return (lastbuf); @@ -3124,31 +3078,30 @@ arc_buf_destroy_impl(arc_buf_t *buf) arc_cksum_verify(buf); arc_buf_unwatch(buf); - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { + ASSERT(!arc_buf_is_shared(buf)); uint64_t size = arc_buf_size(buf); arc_free_data_buf(hdr, buf->b_data, size, buf); ARCSTAT_INCR(arcstat_overhead_size, -size); } buf->b_data = NULL; - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); - hdr->b_l1hdr.b_bufcnt -= 1; - - if (ARC_BUF_ENCRYPTED(buf)) { - hdr->b_crypt_hdr.b_ebufcnt -= 1; - - /* - * If we have no more encrypted buffers and we've - * already gotten a copy of the decrypted data we can - * free b_rabd to save some space. - */ - if (hdr->b_crypt_hdr.b_ebufcnt == 0 && - HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL && - !HDR_IO_IN_PROGRESS(hdr)) { - arc_hdr_free_abd(hdr, B_TRUE); + /* + * If we have no more encrypted buffers and we've already + * gotten a copy of the decrypted data we can free b_rabd + * to save some space. 
+ */ + if (ARC_BUF_ENCRYPTED(buf) && HDR_HAS_RABD(hdr) && + hdr->b_l1hdr.b_pabd != NULL && !HDR_IO_IN_PROGRESS(hdr)) { + arc_buf_t *b; + for (b = hdr->b_l1hdr.b_buf; b; b = b->b_next) { + if (b != buf && ARC_BUF_ENCRYPTED(b)) + break; } + if (b == NULL) + arc_hdr_free_abd(hdr, B_TRUE); } } @@ -3169,9 +3122,9 @@ arc_buf_destroy_impl(arc_buf_t *buf) */ if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) { /* Only one buf can be shared at once */ - VERIFY(!arc_buf_is_shared(lastbuf)); + ASSERT(!arc_buf_is_shared(lastbuf)); /* hdr is uncompressed so can't have compressed buf */ - VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); + ASSERT(!ARC_BUF_COMPRESSED(lastbuf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); arc_hdr_free_abd(hdr, B_FALSE); @@ -3309,11 +3262,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, arc_buf_hdr_t *hdr; VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); - if (protected) { - hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE); - } else { - hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); - } + hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); ASSERT(HDR_EMPTY(hdr)); #ifdef ZFS_DEBUG @@ -3336,7 +3285,6 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; - hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_buf = NULL; ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); @@ -3362,16 +3310,6 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || (old == hdr_l2only_cache && new == hdr_full_cache)); - /* - * if the caller wanted a new full header and the header is to be - * encrypted we will actually allocate the header from the full crypt - * cache instead. The same applies to freeing from the old cache. 
- */ - if (HDR_PROTECTED(hdr) && new == hdr_full_cache) - new = hdr_full_crypt_cache; - if (HDR_PROTECTED(hdr) && old == hdr_full_cache) - old = hdr_full_crypt_cache; - nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); @@ -3379,7 +3317,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) memcpy(nhdr, hdr, HDR_L2ONLY_SIZE); - if (new == hdr_full_cache || new == hdr_full_crypt_cache) { + if (new == hdr_full_cache) { arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); /* * arc_access and arc_change_state need to be aware that a @@ -3393,7 +3331,6 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) ASSERT(!HDR_HAS_RABD(hdr)); } else { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT0(hdr->b_l1hdr.b_bufcnt); #ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); #endif @@ -3459,126 +3396,6 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) return (nhdr); } -/* - * This function allows an L1 header to be reallocated as a crypt - * header and vice versa. If we are going to a crypt header, the - * new fields will be zeroed out. - */ -static arc_buf_hdr_t * -arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) -{ - arc_buf_hdr_t *nhdr; - arc_buf_t *buf; - kmem_cache_t *ncache, *ocache; - - /* - * This function requires that hdr is in the arc_anon state. - * Therefore it won't have any L2ARC data for us to worry - * about copying. 
- */ - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(!HDR_HAS_L2HDR(hdr)); - ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt); - ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node)); - ASSERT3P(hdr->b_hash_next, ==, NULL); - - if (need_crypt) { - ncache = hdr_full_crypt_cache; - ocache = hdr_full_cache; - } else { - ncache = hdr_full_cache; - ocache = hdr_full_crypt_cache; - } - - nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE); - - /* - * Copy all members that aren't locks or condvars to the new header. - * No lists are pointing to us (as we asserted above), so we don't - * need to worry about the list nodes. - */ - nhdr->b_dva = hdr->b_dva; - nhdr->b_birth = hdr->b_birth; - nhdr->b_type = hdr->b_type; - nhdr->b_flags = hdr->b_flags; - nhdr->b_psize = hdr->b_psize; - nhdr->b_lsize = hdr->b_lsize; - nhdr->b_spa = hdr->b_spa; -#ifdef ZFS_DEBUG - nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum; -#endif - nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt; - nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap; - nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state; - nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access; - nhdr->b_l1hdr.b_mru_hits = hdr->b_l1hdr.b_mru_hits; - nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits; - nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits; - nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; - nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb; - nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd; - - /* - * This zfs_refcount_add() exists only to ensure that the individual - * arc buffers always point to a header that is referenced, avoiding - * a small race condition that could trigger ASSERTs. 
- */ - (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG); - nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf; - for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) - buf->b_hdr = nhdr; - - zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt); - (void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG); - ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); - - if (need_crypt) { - arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED); - } else { - arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED); - } - - /* unset all members of the original hdr */ - memset(&hdr->b_dva, 0, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_type = 0; - hdr->b_flags = 0; - hdr->b_psize = 0; - hdr->b_lsize = 0; - hdr->b_spa = 0; -#ifdef ZFS_DEBUG - hdr->b_l1hdr.b_freeze_cksum = NULL; -#endif - hdr->b_l1hdr.b_buf = NULL; - hdr->b_l1hdr.b_bufcnt = 0; - hdr->b_l1hdr.b_byteswap = 0; - hdr->b_l1hdr.b_state = NULL; - hdr->b_l1hdr.b_arc_access = 0; - hdr->b_l1hdr.b_mru_hits = 0; - hdr->b_l1hdr.b_mru_ghost_hits = 0; - hdr->b_l1hdr.b_mfu_hits = 0; - hdr->b_l1hdr.b_mfu_ghost_hits = 0; - hdr->b_l1hdr.b_acb = NULL; - hdr->b_l1hdr.b_pabd = NULL; - - if (ocache == hdr_full_crypt_cache) { - ASSERT(!HDR_HAS_RABD(hdr)); - hdr->b_crypt_hdr.b_ot = DMU_OT_NONE; - hdr->b_crypt_hdr.b_ebufcnt = 0; - hdr->b_crypt_hdr.b_dsobj = 0; - memset(hdr->b_crypt_hdr.b_salt, 0, ZIO_DATA_SALT_LEN); - memset(hdr->b_crypt_hdr.b_iv, 0, ZIO_DATA_IV_LEN); - memset(hdr->b_crypt_hdr.b_mac, 0, ZIO_DATA_MAC_LEN); - } - - buf_discard_identity(hdr); - kmem_cache_free(ocache, hdr); - - return (nhdr); -} - /* * This function is used by the send / receive code to convert a newly * allocated arc_buf_t to one that is suitable for a raw encrypted write. 
It @@ -3598,8 +3415,7 @@ arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED); - if (!HDR_PROTECTED(hdr)) - hdr = arc_hdr_realloc_crypt(hdr, B_TRUE); + arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? @@ -3800,8 +3616,6 @@ static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { - ASSERT(hdr->b_l1hdr.b_buf == NULL || - hdr->b_l1hdr.b_bufcnt > 0); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } @@ -3865,12 +3679,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) #ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); #endif - - if (!HDR_PROTECTED(hdr)) { - kmem_cache_free(hdr_full_cache, hdr); - } else { - kmem_cache_free(hdr_full_crypt_cache, hdr); - } + kmem_cache_free(hdr_full_cache, hdr); } else { kmem_cache_free(hdr_l2only_cache, hdr); } @@ -3882,7 +3691,8 @@ arc_buf_destroy(arc_buf_t *buf, const void *tag) arc_buf_hdr_t *hdr = buf->b_hdr; if (hdr->b_l1hdr.b_state == arc_anon) { - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); + ASSERT(ARC_BUF_LAST(buf)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); VERIFY0(remove_reference(hdr, tag)); return; @@ -3892,7 +3702,7 @@ arc_buf_destroy(arc_buf_t *buf, const void *tag) mutex_enter(hash_lock); ASSERT3P(hdr, ==, buf->b_hdr); - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); ASSERT3P(buf->b_data, !=, NULL); @@ -3935,7 +3745,6 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted) ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 
ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); @@ -5597,13 +5406,6 @@ arc_read_done(zio_t *zio) buf_hash_remove(hdr); } - /* - * Broadcast before we drop the hash_lock to avoid the possibility - * that the hdr (and hence the cv) might be freed before we get to - * the cv_broadcast(). - */ - cv_broadcast(&hdr->b_l1hdr.b_cv); - arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); (void) remove_reference(hdr, hdr); @@ -5798,8 +5600,7 @@ top: } acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; - if (hdr->b_l1hdr.b_acb) - hdr->b_l1hdr.b_acb->acb_prev = acb; + hdr->b_l1hdr.b_acb->acb_prev = acb; hdr->b_l1hdr.b_acb = acb; } mutex_exit(hash_lock); @@ -5939,8 +5740,28 @@ top: * and so the performance impact shouldn't * matter. */ - cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); + arc_callback_t *acb = kmem_zalloc( + sizeof (arc_callback_t), KM_SLEEP); + acb->acb_wait = B_TRUE; + mutex_init(&acb->acb_wait_lock, NULL, + MUTEX_DEFAULT, NULL); + cv_init(&acb->acb_wait_cv, NULL, CV_DEFAULT, + NULL); + acb->acb_zio_head = + hdr->b_l1hdr.b_acb->acb_zio_head; + acb->acb_next = hdr->b_l1hdr.b_acb; + hdr->b_l1hdr.b_acb->acb_prev = acb; + hdr->b_l1hdr.b_acb = acb; mutex_exit(hash_lock); + mutex_enter(&acb->acb_wait_lock); + while (acb->acb_wait) { + cv_wait(&acb->acb_wait_cv, + &acb->acb_wait_lock); + } + mutex_exit(&acb->acb_wait_lock); + mutex_destroy(&acb->acb_wait_lock); + cv_destroy(&acb->acb_wait_cv); + kmem_free(acb, sizeof (arc_callback_t)); goto top; } } @@ -6060,12 +5881,9 @@ top: * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. - * 5. This isn't prefetch or l2arc_noprefetch is 0. 
*/ if (HDR_HAS_L2HDR(hdr) && - !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && - !(l2arc_noprefetch && - (*arc_flags & ARC_FLAG_PREFETCH))) { + !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) { l2arc_read_callback_t *cb; abd_t *abd; uint64_t asize; @@ -6245,6 +6063,56 @@ arc_remove_prune_callback(arc_prune_t *p) kmem_free(p, sizeof (*p)); } +/* + * Helper function for arc_prune_async() it is responsible for safely + * handling the execution of a registered arc_prune_func_t. + */ +static void +arc_prune_task(void *ptr) +{ + arc_prune_t *ap = (arc_prune_t *)ptr; + arc_prune_func_t *func = ap->p_pfunc; + + if (func != NULL) + func(ap->p_adjust, ap->p_private); + + zfs_refcount_remove(&ap->p_refcnt, func); +} + +/* + * Notify registered consumers they must drop holds on a portion of the ARC + * buffers they reference. This provides a mechanism to ensure the ARC can + * honor the metadata limit and reclaim otherwise pinned ARC buffers. + * + * This operation is performed asynchronously so it may be safely called + * in the context of the arc_reclaim_thread(). A reference is taken here + * for each registered arc_prune_t and the arc_prune_task() is responsible + * for releasing it once the registered arc_prune_func_t has completed. + */ +static void +arc_prune_async(uint64_t adjust) +{ + arc_prune_t *ap; + + mutex_enter(&arc_prune_mtx); + for (ap = list_head(&arc_prune_list); ap != NULL; + ap = list_next(&arc_prune_list, ap)) { + + if (zfs_refcount_count(&ap->p_refcnt) >= 2) + continue; + + zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc); + ap->p_adjust = adjust; + if (taskq_dispatch(arc_prune_taskq, arc_prune_task, + ap, TQ_SLEEP) == TASKQID_INVALID) { + zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc); + continue; + } + ARCSTAT_BUMP(arcstat_prune); + } + mutex_exit(&arc_prune_mtx); +} + /* * Notify the arc that a block was freed, and thus will never be used again. 
*/ @@ -6321,7 +6189,8 @@ arc_release(arc_buf_t *buf, const void *tag) ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); + ASSERT(ARC_BUF_LAST(buf)); ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); @@ -6372,7 +6241,7 @@ arc_release(arc_buf_t *buf, const void *tag) /* * Do we have more than one buf? */ - if (hdr->b_l1hdr.b_bufcnt > 1) { + if (hdr->b_l1hdr.b_buf != buf || !ARC_BUF_LAST(buf)) { arc_buf_hdr_t *nhdr; uint64_t spa = hdr->b_spa; uint64_t psize = HDR_GET_PSIZE(hdr); @@ -6385,7 +6254,7 @@ arc_release(arc_buf_t *buf, const void *tag) ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); VERIFY3S(remove_reference(hdr, tag), >, 0); - if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { + if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); ASSERT(ARC_BUF_LAST(buf)); } @@ -6402,9 +6271,9 @@ arc_release(arc_buf_t *buf, const void *tag) * If the current arc_buf_t and the hdr are sharing their data * buffer, then we must stop sharing that block. 
*/ - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); - VERIFY(!arc_buf_is_shared(lastbuf)); + ASSERT(!arc_buf_is_shared(lastbuf)); /* * First, sever the block sharing relationship between @@ -6437,7 +6306,7 @@ arc_release(arc_buf_t *buf, const void *tag) */ ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); - ASSERT(!ARC_BUF_SHARED(buf)); + ASSERT(!arc_buf_is_shared(buf)); } ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); @@ -6453,10 +6322,6 @@ arc_release(arc_buf_t *buf, const void *tag) arc_buf_size(buf), buf); } - hdr->b_l1hdr.b_bufcnt -= 1; - if (ARC_BUF_ENCRYPTED(buf)) - hdr->b_crypt_hdr.b_ebufcnt -= 1; - arc_cksum_verify(buf); arc_buf_unwatch(buf); @@ -6469,15 +6334,11 @@ arc_release(arc_buf_t *buf, const void *tag) nhdr = arc_hdr_alloc(spa, psize, lsize, protected, compress, hdr->b_complevel, type); ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); - ASSERT0(nhdr->b_l1hdr.b_bufcnt); ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); VERIFY3U(nhdr->b_type, ==, type); ASSERT(!HDR_SHARED_DATA(nhdr)); nhdr->b_l1hdr.b_buf = buf; - nhdr->b_l1hdr.b_bufcnt = 1; - if (ARC_BUF_ENCRYPTED(buf)) - nhdr->b_crypt_hdr.b_ebufcnt = 1; (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; @@ -6528,7 +6389,7 @@ arc_write_ready(zio_t *zio) ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); /* * If we're reexecuting this zio because the pool suspended, then @@ -6539,9 +6400,10 @@ arc_write_ready(zio_t *zio) arc_cksum_free(hdr); arc_buf_unwatch(buf); if (hdr->b_l1hdr.b_pabd != NULL) { - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { arc_unshare_buf(hdr, buf); } else { + ASSERT(!arc_buf_is_shared(buf)); arc_hdr_free_abd(hdr, B_FALSE); } } @@ -6563,13 +6425,9 @@ arc_write_ready(zio_t *zio) add_reference(hdr, hdr); /* For IO_IN_PROGRESS. 
*/ } - if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr)) - hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp)); - if (BP_IS_PROTECTED(bp)) { /* ZIL blocks are written through zio_rewrite */ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); - ASSERT(HDR_PROTECTED(hdr)); if (BP_SHOULD_BYTESWAP(bp)) { if (BP_GET_LEVEL(bp) > 0) { @@ -6582,11 +6440,14 @@ arc_write_ready(zio_t *zio) hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } + arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); + } else { + arc_hdr_clear_flags(hdr, ARC_FLAG_PROTECTED); } /* @@ -6667,7 +6528,8 @@ arc_write_ready(zio_t *zio) } else { ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); + ASSERT(ARC_BUF_LAST(buf)); arc_share_buf(hdr, buf); } @@ -6748,7 +6610,8 @@ arc_write_done(zio_t *zio) (void *)hdr, (void *)exists); } else { /* Dedup */ - ASSERT(hdr->b_l1hdr.b_bufcnt == 1); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); + ASSERT(ARC_BUF_LAST(hdr->b_l1hdr.b_buf)); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); @@ -6789,7 +6652,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); if (uncached) arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED); else if (l2arc) @@ -6839,9 +6702,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, * The hdr will remain with a NULL data pointer and the * buf will take sole ownership of the block. 
*/ - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { arc_unshare_buf(hdr, buf); } else { + ASSERT(!arc_buf_is_shared(buf)); arc_hdr_free_abd(hdr, B_FALSE); } VERIFY3P(buf->b_data, !=, NULL); diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 0eb8c17e33..8451b5082e 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -210,10 +210,12 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); - db = dbuf_hold_level(dn, level, blkid, FTAG); + err = dbuf_hold_impl(dn, level, blkid, TRUE, FALSE, FTAG, &db); rw_exit(&dn->dn_struct_rwlock); - if (db == NULL) - return (SET_ERROR(EIO)); + if (err == ENOENT) + return (0); + if (err != 0) + return (err); /* * PARTIAL_FIRST allows caching for uncacheable blocks. It will * be cleared after dmu_buf_will_dirty() call dbuf_read() again. diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 9120fef93c..17b9712482 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -965,18 +965,18 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp) uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - mutex_enter(&dp->dp_lock); - uint64_t dirty = dp->dp_dirty_total; - mutex_exit(&dp->dp_lock); - - return (dirty > delay_min_bytes); + /* + * We are not taking the dp_lock here and few other places, since torn + * reads are unlikely: on 64-bit systems due to register size and on + * 32-bit due to memory constraints. Pool-wide locks in hot path may + * be too expensive, while we do not need a precise result here. 
+ */ + return (dp->dp_dirty_total > delay_min_bytes); } static boolean_t dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg) { - ASSERT(MUTEX_HELD(&dp->dp_lock)); - uint64_t dirty_min_bytes = zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index cdf599b179..599d7ffa0c 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -207,11 +207,6 @@ static const uint32_t metaslab_min_search_count = 100; */ static int metaslab_df_use_largest_segment = B_FALSE; -/* - * Percentage of all cpus that can be used by the metaslab taskq. - */ -int metaslab_load_pct = 50; - /* * These tunables control how long a metaslab will remain loaded after the * last allocation from it. A metaslab can't be unloaded until at least @@ -856,9 +851,6 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth); } - mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, - maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC); - return (mg); } @@ -874,7 +866,6 @@ metaslab_group_destroy(metaslab_group_t *mg) */ ASSERT(mg->mg_activation_count <= 0); - taskq_destroy(mg->mg_taskq); avl_destroy(&mg->mg_metaslab_tree); mutex_destroy(&mg->mg_lock); mutex_destroy(&mg->mg_ms_disabled_lock); @@ -965,7 +956,7 @@ metaslab_group_passivate(metaslab_group_t *mg) * allocations from taking place and any changes to the vdev tree. 
*/ spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); - taskq_wait_outstanding(mg->mg_taskq, 0); + taskq_wait_outstanding(spa->spa_metaslab_taskq, 0); spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); metaslab_group_alloc_update(mg); for (int i = 0; i < mg->mg_allocators; i++) { @@ -3529,10 +3520,8 @@ metaslab_group_preload(metaslab_group_t *mg) avl_tree_t *t = &mg->mg_metaslab_tree; int m = 0; - if (spa_shutting_down(spa) || !metaslab_preload_enabled) { - taskq_wait_outstanding(mg->mg_taskq, 0); + if (spa_shutting_down(spa) || !metaslab_preload_enabled) return; - } mutex_enter(&mg->mg_lock); @@ -3552,8 +3541,9 @@ metaslab_group_preload(metaslab_group_t *mg) continue; } - VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, - msp, TQ_SLEEP) != TASKQID_INVALID); + VERIFY(taskq_dispatch(spa->spa_metaslab_taskq, metaslab_preload, + msp, TQ_SLEEP | (m <= mg->mg_allocators ? TQ_FRONT : 0)) + != TASKQID_INVALID); } mutex_exit(&mg->mg_lock); } @@ -6182,6 +6172,9 @@ ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW, "Preload potential metaslabs during reassessment"); +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_limit, UINT, ZMOD_RW, + "Max number of metaslabs per group to preload"); + ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW, "Delay in txgs after metaslab was last used before unloading"); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 88ee4ea9f4..1410651c63 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -169,6 +169,11 @@ static int spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport); static void spa_vdev_resilver_done(spa_t *spa); +/* + * Percentage of all CPUs that can be used by the metaslab preload taskq. 
+ */ +static uint_t metaslab_preload_pct = 50; + static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ static uint_t zio_taskq_batch_tpq; /* threads per taskq */ static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ @@ -1397,6 +1402,13 @@ spa_activate(spa_t *spa, spa_mode_t mode) spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, 1, INT_MAX, 0); + /* + * The taskq to preload metaslabs. + */ + spa->spa_metaslab_taskq = taskq_create("z_metaslab", + metaslab_preload_pct, maxclsyspri, 1, INT_MAX, + TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); + /* * Taskq dedicated to prefetcher threads: this is used to prevent the * pool traverse code from monopolizing the global (and limited) @@ -1432,6 +1444,11 @@ spa_deactivate(spa_t *spa) spa->spa_zvol_taskq = NULL; } + if (spa->spa_metaslab_taskq) { + taskq_destroy(spa->spa_metaslab_taskq); + spa->spa_metaslab_taskq = NULL; + } + if (spa->spa_prefetch_taskq) { taskq_destroy(spa->spa_prefetch_taskq); spa->spa_prefetch_taskq = NULL; @@ -1704,13 +1721,7 @@ spa_unload(spa_t *spa) * This ensures that there is no async metaslab prefetching * while we attempt to unload the spa. */ - if (spa->spa_root_vdev != NULL) { - for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { - vdev_t *vc = spa->spa_root_vdev->vdev_child[c]; - if (vc->vdev_mg != NULL) - taskq_wait(vc->vdev_mg->mg_taskq); - } - } + taskq_wait(spa->spa_metaslab_taskq); if (spa->spa_mmp.mmp_thread) mmp_thread_stop(spa); @@ -3920,6 +3931,24 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, rvd = mrvd; spa_config_exit(spa, SCL_ALL, FTAG); + /* + * If 'zpool import' used a cached config, then the on-disk hostid and + * hostname may be different to the cached config in ways that should + * prevent import. Userspace can't discover this without a scan, but + * we know, so we add these values to LOAD_INFO so the caller can know + * the difference. 
+ * + * Note that we have to do this before the config is regenerated, + * because the new config will have the hostid and hostname for this + * host, in readiness for import. + */ + if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID)) + fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID, + fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID)); + if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME)) + fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME, + fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME)); + /* * We will use spa_config if we decide to reload the spa or if spa_load * fails and we rewind. We must thus regenerate the config using the @@ -10132,6 +10161,9 @@ EXPORT_SYMBOL(spa_prop_clear_bootfs); /* asynchronous event notification */ EXPORT_SYMBOL(spa_event_notify); +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW, + "Percentage of CPUs to run a metaslab preload taskq"); + /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW, "log2 fraction of arc that can be used by inflight I/Os when " diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index 636c04d9f7..a77874ea0d 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -367,23 +367,24 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent, * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration * information for all pool visible within the zone. 
*/ -nvlist_t * -spa_all_configs(uint64_t *generation) +int +spa_all_configs(uint64_t *generation, nvlist_t **pools) { - nvlist_t *pools; spa_t *spa = NULL; if (*generation == spa_config_generation) - return (NULL); + return (SET_ERROR(EEXIST)); - pools = fnvlist_alloc(); + int error = mutex_enter_interruptible(&spa_namespace_lock); + if (error) + return (SET_ERROR(EINTR)); - mutex_enter(&spa_namespace_lock); + *pools = fnvlist_alloc(); while ((spa = spa_next(spa)) != NULL) { if (INGLOBALZONE(curproc) || zone_dataset_visible(spa_name(spa), NULL)) { mutex_enter(&spa->spa_props_lock); - fnvlist_add_nvlist(pools, spa_name(spa), + fnvlist_add_nvlist(*pools, spa_name(spa), spa->spa_config); mutex_exit(&spa->spa_props_lock); } @@ -391,7 +392,7 @@ spa_all_configs(uint64_t *generation) *generation = spa_config_generation; mutex_exit(&spa_namespace_lock); - return (pools); + return (0); } void diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 87c1455932..afb01c0ef7 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -4215,6 +4215,7 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); + spa->spa_ccw_fail_time = 0; spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index a5c76808f2..a2e5524a83 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1138,6 +1138,16 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) POOL_STATE_L2CACHE) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); + + /* + * This is merely to facilitate reporting the ashift of the + * cache device through zdb. The actual retrieval of the + * ashift (in vdev_alloc()) uses the nvlist + * spa->spa_l2cache->sav_config (populated in + * spa_ld_open_aux_vdevs()). 
+ */ + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_ASHIFT, + vd->vdev_ashift) == 0); } else { uint64_t txg = 0ULL; diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 08d918467d..092b3f375b 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -273,8 +273,10 @@ vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio) { zio_priority_t p = zio->io_priority; vq->vq_cqueued |= 1U << p; - if (vdev_queue_class_fifo(p)) + if (vdev_queue_class_fifo(p)) { list_insert_tail(&vq->vq_class[p].vqc_list, zio); + vq->vq_class[p].vqc_list_numnodes++; + } else avl_add(&vq->vq_class[p].vqc_tree, zio); } @@ -288,6 +290,7 @@ vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio) list_t *list = &vq->vq_class[p].vqc_list; list_remove(list, zio); empty = list_is_empty(list); + vq->vq_class[p].vqc_list_numnodes--; } else { avl_tree_t *tree = &vq->vq_class[p].vqc_tree; avl_remove(tree, zio); @@ -1069,7 +1072,7 @@ vdev_queue_class_length(vdev_t *vd, zio_priority_t p) { vdev_queue_t *vq = &vd->vdev_queue; if (vdev_queue_class_fifo(p)) - return (list_is_empty(&vq->vq_class[p].vqc_list) == 0); + return (vq->vq_class[p].vqc_list_numnodes); else return (avl_numnodes(&vq->vq_class[p].vqc_tree)); } diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 75c3900cbb..6503390f79 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -807,12 +807,12 @@ vdev_rebuild_thread(void *arg) /* * Calculate the max number of in-flight bytes for top-level - * vdev scanning operations (minimum 1MB, maximum 1/4 of + * vdev scanning operations (minimum 1MB, maximum 1/2 of * arc_c_max shared by all top-level vdevs). Limits for the * issuing phase are done per top-level vdev and are handled * separately. 
*/ - uint64_t limit = (arc_c_max / 4) / MAX(rvd->vdev_children, 1); + uint64_t limit = (arc_c_max / 2) / MAX(rvd->vdev_children, 1); vr->vr_bytes_inflight_max = MIN(limit, MAX(1ULL << 20, zfs_rebuild_vdev_limit * vd->vdev_children)); diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 0d71b94343..03e17db024 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -23,6 +23,7 @@ * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright (c) 2019 by Lawrence Livermore National Security, LLC. * Copyright (c) 2021 Hewlett Packard Enterprise Development LP + * Copyright 2023 RackTop Systems, Inc. */ #include @@ -591,6 +592,7 @@ vdev_trim_ranges(trim_args_t *ta) uint64_t extent_bytes_max = ta->trim_extent_bytes_max; uint64_t extent_bytes_min = ta->trim_extent_bytes_min; spa_t *spa = vd->vdev_spa; + int error = 0; ta->trim_start_time = gethrtime(); ta->trim_bytes_done = 0; @@ -610,19 +612,32 @@ vdev_trim_ranges(trim_args_t *ta) uint64_t writes_required = ((size - 1) / extent_bytes_max) + 1; for (uint64_t w = 0; w < writes_required; w++) { - int error; - error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE + rs_get_start(rs, ta->trim_tree) + (w *extent_bytes_max), MIN(size - (w * extent_bytes_max), extent_bytes_max)); if (error != 0) { - return (error); + goto done; } } } - return (0); +done: + /* + * Make sure all TRIMs for this metaslab have completed before + * returning. TRIM zios have lower priority over regular or syncing + * zios, so all TRIM zios for this metaslab must complete before the + * metaslab is re-enabled. Otherwise it's possible write zios to + * this metaslab could cut ahead of still queued TRIM zios for this + * metaslab causing corruption if the ranges overlap. 
+ */ + mutex_enter(&vd->vdev_trim_io_lock); + while (vd->vdev_trim_inflight[0] > 0) { + cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); + } + mutex_exit(&vd->vdev_trim_io_lock); + + return (error); } static void @@ -941,11 +956,6 @@ vdev_trim_thread(void *arg) } spa_config_exit(spa, SCL_CONFIG, FTAG); - mutex_enter(&vd->vdev_trim_io_lock); - while (vd->vdev_trim_inflight[0] > 0) { - cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); - } - mutex_exit(&vd->vdev_trim_io_lock); range_tree_destroy(ta.trim_tree); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index f91a2f3bbc..2738385e26 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1582,8 +1582,9 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc) nvlist_t *configs; int error; - if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) - return (SET_ERROR(EEXIST)); + error = spa_all_configs(&zc->zc_cookie, &configs); + if (error) + return (error); error = put_nvlist(zc, configs); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index a64e1e2dc8..84e6b10ef3 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1094,6 +1094,15 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, ASSERT(!outzfsvfs->z_replay); + /* + * Block cloning from an unencrypted dataset into an encrypted + * dataset and vice versa is not supported. + */ + if (inos->os_encrypted != outos->os_encrypted) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EXDEV)); + } + error = zfs_verify_zp(inzp); if (error == 0) error = zfs_verify_zp(outzp); @@ -1206,6 +1215,19 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, goto unlock; } + /* + * If we are copying only one block and it is smaller than recordsize + * property, do not allow destination to grow beyond one block if it + * is not there yet. Otherwise the destination will get stuck with + * that block size forever, that can be as small as 512 bytes, no + * matter how big the destination grow later. 
+ */ + if (len <= inblksz && inblksz < outzfsvfs->z_max_blksz && + outzp->z_size <= inblksz && outoff + len > inblksz) { + error = SET_ERROR(EINVAL); + goto unlock; + } + error = zn_rlimit_fsize(outoff + len); if (error != 0) { goto unlock; diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 9e9c9c2254..a118861369 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -145,7 +145,7 @@ static int zil_nocacheflush = 0; * Any writes above that will be executed with lower (asynchronous) priority * to limit potential SLOG device abuse by single active ZIL writer. */ -static uint64_t zil_slog_bulk = 768 * 1024; +static uint64_t zil_slog_bulk = 64 * 1024 * 1024; static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; @@ -1958,26 +1958,28 @@ zil_max_log_data(zilog_t *zilog, size_t hdrsize) /* * Maximum amount of log space we agree to waste to reduce number of - * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%). + * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~6%). */ static inline uint64_t zil_max_waste_space(zilog_t *zilog) { - return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 8); + return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 16); } /* * Maximum amount of write data for WR_COPIED. For correctness, consumers * must fall back to WR_NEED_COPY if we can't fit the entire record into one * maximum sized log block, because each WR_COPIED record must fit in a - * single log block. For space efficiency, we want to fit two records into a - * max-sized log block. + * single log block. Below that it is a tradeoff of additional memory copy + * and possibly worse log space efficiency vs additional range lock/unlock. 
*/ +static uint_t zil_maxcopied = 7680; + uint64_t zil_max_copied_data(zilog_t *zilog) { - return ((zilog->zl_max_block_size - sizeof (zil_chain_t)) / 2 - - sizeof (lr_write_t)); + uint64_t max_data = zil_max_log_data(zilog, sizeof (lr_write_t)); + return (MIN(max_data, zil_maxcopied)); } /* @@ -4226,3 +4228,6 @@ ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, U64, ZMOD_RW, ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW, "Limit in bytes of ZIL log block size"); + +ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW, + "Limit in bytes WR_COPIED size"); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 3b3b40fa73..a719e54923 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -158,23 +158,22 @@ zio_init(void) zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - /* - * For small buffers, we want a cache for each multiple of - * SPA_MINBLOCKSIZE. For larger buffers, we want a cache - * for each quarter-power of 2. - */ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; + size_t align, cflags, data_cflags; + char name[32]; + + /* + * Create cache for each half-power of 2 size, starting from + * SPA_MINBLOCKSIZE. It should give us memory space efficiency + * of ~7/8, sufficient for transient allocations mostly using + * these caches. + */ size_t p2 = size; - size_t align = 0; - size_t data_cflags, cflags; - - data_cflags = KMC_NODEBUG; - cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? - KMC_NODEBUG : 0; - while (!ISP2(p2)) p2 &= p2 - 1; + if (!IS_P2ALIGNED(size, p2 / 2)) + continue; #ifndef _KERNEL /* @@ -185,47 +184,37 @@ zio_init(void) */ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) continue; - /* - * Here's the problem - on 4K native devices in userland on - * Linux using O_DIRECT, buffers must be 4K aligned or I/O - * will fail with EINVAL, causing zdb (and others) to coredump. 
- * Since userland probably doesn't need optimized buffer caches, - * we just force 4K alignment on everything. - */ - align = 8 * SPA_MINBLOCKSIZE; -#else - if (size < PAGESIZE) { - align = SPA_MINBLOCKSIZE; - } else if (IS_P2ALIGNED(size, p2 >> 2)) { - align = PAGESIZE; - } #endif - if (align != 0) { - char name[36]; - if (cflags == data_cflags) { - /* - * Resulting kmem caches would be identical. - * Save memory by creating only one. - */ - (void) snprintf(name, sizeof (name), - "zio_buf_comb_%lu", (ulong_t)size); - zio_buf_cache[c] = kmem_cache_create(name, - size, align, NULL, NULL, NULL, NULL, NULL, - cflags); - zio_data_buf_cache[c] = zio_buf_cache[c]; - continue; - } - (void) snprintf(name, sizeof (name), "zio_buf_%lu", - (ulong_t)size); - zio_buf_cache[c] = kmem_cache_create(name, size, - align, NULL, NULL, NULL, NULL, NULL, cflags); + if (IS_P2ALIGNED(size, PAGESIZE)) + align = PAGESIZE; + else + align = 1 << (highbit64(size ^ (size - 1)) - 1); - (void) snprintf(name, sizeof (name), "zio_data_buf_%lu", - (ulong_t)size); - zio_data_buf_cache[c] = kmem_cache_create(name, size, - align, NULL, NULL, NULL, NULL, NULL, data_cflags); + cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? + KMC_NODEBUG : 0; + data_cflags = KMC_NODEBUG; + if (cflags == data_cflags) { + /* + * Resulting kmem caches would be identical. + * Save memory by creating only one. 
+ */ + (void) snprintf(name, sizeof (name), + "zio_buf_comb_%lu", (ulong_t)size); + zio_buf_cache[c] = kmem_cache_create(name, size, align, + NULL, NULL, NULL, NULL, NULL, cflags); + zio_data_buf_cache[c] = zio_buf_cache[c]; + continue; } + (void) snprintf(name, sizeof (name), "zio_buf_%lu", + (ulong_t)size); + zio_buf_cache[c] = kmem_cache_create(name, size, align, + NULL, NULL, NULL, NULL, NULL, cflags); + + (void) snprintf(name, sizeof (name), "zio_data_buf_%lu", + (ulong_t)size); + zio_data_buf_cache[c] = kmem_cache_create(name, size, align, + NULL, NULL, NULL, NULL, NULL, data_cflags); } while (--c != 0) { diff --git a/rpm/generic/zfs-dkms.spec.in b/rpm/generic/zfs-dkms.spec.in index 23c3ed6ff4..d56967d7a8 100644 --- a/rpm/generic/zfs-dkms.spec.in +++ b/rpm/generic/zfs-dkms.spec.in @@ -24,6 +24,7 @@ BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) BuildArch: noarch Requires: dkms >= 2.2.0.3 +Requires(pre): dkms >= 2.2.0.3 Requires(post): dkms >= 2.2.0.3 Requires(preun): dkms >= 2.2.0.3 Requires: gcc, make, perl, diffutils @@ -68,9 +69,92 @@ fi %defattr(-,root,root) /usr/src/%{module}-%{version} +%pre +echo "Running pre installation script: $0. Parameters: $*" +# We don't want any other versions lingering around in dkms. +# Tests with 'dnf' showed that in case of reinstall, or upgrade +# the preun scriptlet removed the version we are trying to install. +# Because of this, find all zfs dkms sources in /var/lib/dkms and +# remove them, if we find a matching version in dkms. + +dkms_root=/var/lib/dkms +if [ -d ${dkms_root}/%{module} ]; then + cd ${dkms_root}/%{module} + for x in [[:digit:]]*; do + [ -d "$x" ] || continue + otherver="$x" + opath="${dkms_root}/%{module}/${otherver}" + if [ "$otherver" != %{version} ]; then + # This is a workaround for a broken 'dkms status', we caused in a previous version. + # One day it might be not needed anymore, but it does not hurt to keep it. 
+ if dkms status -m %{module} -v "$otherver" 2>&1 | grep "${opath}/source/dkms.conf does not exist" + then + echo "ERROR: dkms status is broken!" >&2 + if [ -L "${opath}/source" -a ! -d "${opath}/source" ] + then + echo "Trying to fix it by removing the symlink: ${opath}/source" >&2 + echo "You should manually remove ${opath}" >&2 + rm -f "${opath}/source" || echo "Removal failed!" >&2 + fi + fi + if [ `dkms status -m %{module} -v "$otherver" | grep -c %{module}` -gt 0 ]; then + echo "Removing old %{module} dkms modules version $otherver from all kernels." + dkms remove -m %{module} -v "$otherver" --all ||: + fi + fi + done +fi + +# Uninstall this version of zfs dkms modules before installation of the package. +if [ `dkms status -m %{module} -v %{version} | grep -c %{module}` -gt 0 ]; then + echo "Removing %{module} dkms modules version %{version} from all kernels." + dkms remove -m %{module} -v %{version} --all ||: +fi + +%post +echo "Running post installation script: $0. Parameters: $*" +# Add the module to dkms, as reccommended in the dkms man page. +# This is generally rpm specfic. +# But this also may help, if we have a broken 'dkms status'. +# Because, if the sources are available and only the symlink pointing +# to them is missing, this will resolve the situation +echo "Adding %{module} dkms modules version %{version} to dkms." +dkms add -m %{module} -v %{version} %{!?not_rpm:--rpm_safe_upgrade} ||: + +# After installing the package, dkms install this zfs version for the current kernel. +# Force the overwriting of old modules to avoid diff warnings in dkms status. +# Or in case of a downgrade to overwrite newer versions. +# Or if some other backed up versions have been restored before. +echo "Installing %{module} dkms modules version %{version} for the current kernel." 
+dkms install --force -m %{module} -v %{version} ||: + %preun -dkms remove -m %{module} -v %{version} --all +dkms_root="/var/lib/dkms/%{module}/%{version}" +echo "Running pre uninstall script: $0. Parameters: $*" +# In case of upgrade we do nothing. See above comment in pre hook. +if [ "$1" = "1" -o "$1" = "upgrade" ] ; then + echo "This is an upgrade. Skipping pre uninstall action." + exit 0 +fi -%posttrans -/usr/lib/dkms/common.postinst %{module} %{version} +# Check if we uninstall the package. In that case remove the dkms modules. +# '0' is the value for the first parameter for rpm packages. +# 'remove' or 'purge' are the possible names for deb packages. +if [ "$1" = "0" -o "$1" = "remove" -o "$1" = "purge" ] ; then + if [ `dkms status -m %{module} -v %{version} | grep -c %{module}` -gt 0 ]; then + echo "Removing %{module} dkms modules version %{version} from all kernels." + dkms remove -m %{module} -v %{version} --all %{!?not_rpm:--rpm_safe_upgrade} && exit 0 + fi + # If removing the modules failed, it might be because of the broken 'dkms status'. + if dkms status -m %{module} -v %{version} 2>&1 | grep "${dkms_root}/source/dkms.conf does not exist" + then + echo "ERROR: dkms status is broken!" >&2 + echo "You should manually remove ${dkms_root}" >&2 + echo "WARNING: installed modules in /lib/modules/`uname -r`/extra could not be removed automatically!" >&2 + fi +else + echo "Script parameter $1 did not match any removal condition." +fi + +exit 0 diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index 8c538a00d2..2e89abd0ed 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -19,6 +19,15 @@ %endif %endif +# Set the default _bashcompletiondir directory based on distribution. 
+%if %{undefined _bashcompletiondir} +%if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler} +%global _bashcompletiondir /etc/bash_completion.d +%else +%global _bashcompletiondir /usr/share/bash-completion +%endif +%endif + # Set the default dracut directory based on distribution. %if %{undefined _dracutdir} %if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler} @@ -522,7 +531,7 @@ systemctl --system daemon-reload >/dev/null || true %config(noreplace) %{_sysconfdir}/%{name}/vdev_id.conf.*.example %attr(440, root, root) %config(noreplace) %{_sysconfdir}/sudoers.d/* -%config(noreplace) %{_sysconfdir}/bash_completion.d/zfs +%config(noreplace) %{_bashcompletiondir}/zfs %files -n libzpool5 %{_libdir}/libzpool.so.* diff --git a/scripts/Makefile.am b/scripts/Makefile.am index 4175d27ea3..b43bf97dbd 100644 --- a/scripts/Makefile.am +++ b/scripts/Makefile.am @@ -20,6 +20,8 @@ scripts_scripts = \ if CONFIG_USER dist_scripts_SCRIPTS = $(scripts_scripts) +dist_zfsexec_SCRIPTS = \ + %D%/zfs_prepare_disk else dist_noinst_SCRIPTS += $(scripts_scripts) endif @@ -27,6 +29,7 @@ endif dist_noinst_DATA += \ %D%/cstyle.pl \ %D%/enum-extract.pl \ + %D%/update_authors.pl \ %D%/zfs2zol-patch.sed \ %D%/zol2zfs-patch.sed diff --git a/scripts/update_authors.pl b/scripts/update_authors.pl new file mode 100755 index 0000000000..8dd49b5fb3 --- /dev/null +++ b/scripts/update_authors.pl @@ -0,0 +1,322 @@ +#!/usr/bin/env perl + +# SPDX-License-Identifier: MIT +# +# Copyright (c) 2023, Rob Norris +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this 
permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + + +# This program will update the AUTHORS file to include commit authors that are +# in the git history but are not yet credited. +# +# The CONTRIBUTORS section of the AUTHORS file attempts to be a list of +# individual contributors to OpenZFS, with one name, address and line per +# person. This is good for readability, but does not really leave room for the +# that names and emails on commits from the same individual can be different, +# for all kinds of reasons, not limited to: +# +# - a person might change organisations, and so their email address changes +# +# - a person might be paid to work on OpenZFS for their employer, and then hack +# on personal projects in the evening, so commits legitimately come from +# different addresses +# +# - names change for all kinds of reasons +# +# To try and account for this, this program will try to find all the possible +# names and emails for a single contributor, and then select the "best" one to +# add to the AUTHORS file. +# +# The CONTRIBUTORS section of the AUTHORS file is considered the source of +# truth. Once an individual committer is listed in there, that line will not be +# removed regardless of what is discovered in the commit history. However, it +# can't just be _anything_. The name or email still has to match something seen +# in the commit history, so that we're able to undertand that its the same +# contributor. 
+# +# The bulk of the work is in running `git log` to fetch commit author names and +# emails. For each value, we generate a "slug" to use as an internal id for +# that value, which is mostly just the lowercase of the value with whitespace +# and punctuation removed. Two values with subtle differences can produce the +# same slug, so at this point we also try to keep the "best" pre-slug value as +# the display version. We use this slug to update two maps, one of email->name, +# the other of name->email. +# +# Once collected, we then walk all the emails we've seen and get all the names +# associated with every instance. Then for each of those names, we get all the +# emails associated, and so on until we've seen all the connected names and +# emails. This collection is every possible name and email for an individual +# contributor. +# +# Finaly, we consider these groups, and select the "best" name and email for +# the contributor, and add them to the author tables if they aren't there +# already. Once we've done everyone, we write out a new AUTHORS file, and +# that's the whole job. +# +# This is imperfect! Its necessary for the user to examine the diff and make +# sure its sensible. If it hasn't hooked up right, it may necessary to adjust +# the input data (via .mailmap) or improve the heuristics in this program. It +# took a long time to get into good shape when first written (355 new names +# added to AUTHORS!) but hopefully in the future we'll be running this +# regularly so it doesn't fall so far behind. + + +use 5.010; +use warnings; +use strict; + +# Storage for the "best looking" version of name or email, keyed on slug. +my %display_name; +my %display_email; + +# First, we load the existing AUTHORS file. We save everything before +# CONTRIBUTORS: line as-is so we can write it back out to the new file. Then +# we extract name,email pairs from the remainder and store them in a pair of +# hashtables, keyed on slug. 
+my %authors_name; +my %authors_email; + +my @authors_header; + +for my $line (do { local (@ARGV) = ('AUTHORS'); <> }) { + chomp $line; + state $in_header = 1; + if ($in_header) { + push @authors_header, $line; + $in_header = 0 if $line =~ m/^CONTRIBUTORS:/; + } else { + my ($name, $email) = $line =~ m/^\s+(.+)(?= <) <([^>]+)/; + next unless $name; + + my $semail = email_slug($email); + my $sname = name_slug($name); + + $authors_name{$semail} = $sname; + $authors_email{$sname} = $semail; + + # The name/email in AUTHORS is already the "best looking" + # version, by definition. + $display_name{$sname} = $name; + $display_email{$semail} = $email; + } +} + +# Next, we load all the commit authors. and form name<->email mappings, keyed +# on slug. Note that this format is getting the .mailmap-converted form. This +# lets us control the input to some extent by making changes there. +my %git_names; +my %git_emails; + +for my $line (reverse qx(git log --pretty=tformat:'%aN:::%aE')) { + chomp $line; + my ($name, $email) = $line =~ m/^(.*):::(.*)/; + next unless $name && $email; + + my $semail = email_slug($email); + my $sname = name_slug($name); + + $git_names{$semail}{$sname} = 1; + $git_emails{$sname}{$semail} = 1; + + # Update the "best looking" display value, but only if we don't already + # have something from the AUTHORS file. If we do, we must not change it. + if (!$authors_name{email_slug($email)}) { + update_display_email($email); + } + + if (!$authors_email{name_slug($name)}) { + update_display_name($name); + } +} + +# Now collect unique committers by all names+emails we've ever seen for them. +# We start with emails and resolve all possible names, then we resolve the +# emails for those names, and round and round until there's nothing left. 
+my @committers; +for my $start_email (sort keys %git_names) { + # it might have been deleted already through a cross-reference + next unless $git_names{$start_email}; + + my %emails; + my %names; + + my @check_emails = ($start_email); + my @check_names; + while (@check_emails || @check_names) { + while (my $email = shift @check_emails) { + next if $emails{$email}++; + push @check_names, + sort keys %{delete $git_names{$email}}; + } + while (my $name = shift @check_names) { + next if $names{$name}++; + push @check_emails, + sort keys %{delete $git_emails{$name}}; + } + } + + # A "committer" is the collection of connected names and emails. + push @committers, [[sort keys %emails], [sort keys %names]]; +} + +# Now we have our committers, we can work out what to add to AUTHORS. +for my $committer (@committers) { + my ($emails, $names) = @$committer; + + # If this commiter is already in AUTHORS, we must not touch. + next if grep { $authors_name{$_} } @$emails; + next if grep { $authors_email{$_} } @$names; + + # Decide on the "best" name and email to use + my $email = best_email(@$emails); + my $name = best_name(@$names); + + $authors_email{$name} = $email; + $authors_name{$email} = $name; +} + +# Now output the new AUTHORS file +open my $fh, '>', 'AUTHORS' or die "E: couldn't open AUTHORS for write: $!\n"; +#my $fh = \*STDOUT; +say $fh join("\n", @authors_header, ""); +for my $name (sort keys %authors_email) { + my $cname = $display_name{$name}; + my $cemail = $display_email{email_slug($authors_email{$name})}; + say $fh " $cname <$cemail>"; +} + +exit 0; + +# "Slugs" are used at the hashtable key for names and emails. They are used to +# making two variants of a value be the "same" for matching. Mostly this is +# to make upper and lower-case versions of a name or email compare the same, +# but we do a little bit of munging to handle some common cases. 
+# +# Note that these are only used for matching internally; for display, the +# slug will be used to look up the display form. +sub name_slug { + my ($name) = @_; + + # Remove spaces and dots, to handle differences in initials. + $name =~ s/[\s\.]//g; + + return lc $name; +} +sub email_slug { + my ($email) = @_; + + # Remove everything up to and including the first space, and the last + # space and everything after it. + $email =~ s/^(.*\s+)|(\s+.*)$//g; + + # Remove the leading userid+ on Github noreply addresses. They're + # optional and we want to treat them as the same thing. + $email =~ s/^[^\+]*\+//g if $email =~ m/\.noreply\.github\.com$/; + + return lc $email; +} + +sub update_display_name { + my ($name) = @_; + my $sname = name_slug($name); + + # For names, "more specific" means "has more non-lower-case characters" + # (in ASCII), guessing that if a person has gone to some effort to + # specialise their name in a later commit, they presumably care more + # about it. If this is wrong, its probably better to add a .mailmap + # entry. + + my $cname = $display_name{$sname}; + if (!$cname || + ($name =~ tr/a-z //) < ($cname =~ tr/a-z //)) { + $display_name{$sname} = $name; + } +} +sub update_display_email { + my ($email) = @_; + my $semail = email_slug($email); + + # Like names, we prefer uppercase when possible. We also remove any + # leading "plus address" for Github noreply addresses. + $email =~ s/^[^\+]*\+//g if $email =~ m/\.noreply\.github\.com$/; + + my $cemail = $display_email{$semail}; + if (!$cemail || + ($email =~ tr/a-z //) < ($cemail =~ tr/a-z //)) { + $display_email{$semail} = $email; + } +} + +sub best_name { + my @names = sort { + my $cmp; + my ($aa) = $display_name{$a}; + my ($bb) = $display_name{$b}; + + # The "best" name is very subjective, and a simple sort + # produced good-enough results, so I didn't try harder. 
Use of + # accented characters, punctuation and caps are probably an + # indicator of "better", but possibly we should also take into + # account the most recent name we saw, in case the committer + # has changed their name or nickname or similar. + # + # Really, .mailmap is the place to control this. + + return ($aa cmp $bb); + } @_; + + return shift @names; +} +sub best_email { + state $internal_re = qr/\.(?:internal|local|\(none\))$/; + state $noreply_re = qr/\.noreply\.github\.com$/; + state $freemail_re = qr/\@(?:gmail|hotmail)\.com$/; + + my @emails = sort { + my $cmp; + + # prefer address with a single @ over those without + $cmp = (($b =~ tr/@//) == 1) <=> (($a =~ tr/@//) == 1); + return $cmp unless $cmp == 0; + + # prefer any address over internal/local addresses + $cmp = (($a =~ $internal_re) <=> ($b =~ $internal_re)); + return $cmp unless $cmp == 0; + + # prefer any address over github noreply aliases + $cmp = (($a =~ $noreply_re) <=> ($b =~ $noreply_re)); + return $cmp unless $cmp == 0; + + # prefer any address over freemail providers + $cmp = (($a =~ $freemail_re) <=> ($b =~ $freemail_re)); + return $cmp unless $cmp == 0; + + # alphabetical by domain + my ($alocal, $adom) = split /\@/, $a; + my ($blocal, $bdom) = split /\@/, $b; + $cmp = ($adom cmp $bdom); + return $cmp unless $cmp == 0; + + # alphabetical by local part + return ($alocal cmp $blocal); + } @_; + + return shift @emails; +} diff --git a/scripts/zfs_prepare_disk b/scripts/zfs_prepare_disk new file mode 100755 index 0000000000..02aa9f8a77 --- /dev/null +++ b/scripts/zfs_prepare_disk @@ -0,0 +1,17 @@ +#!/bin/sh +# +# This is an optional helper script that is automatically called by libzfs +# before a disk is about to be added into the pool. It can be modified by +# the user to run whatever commands are necessary to prepare a disk for +# inclusion into the pool. For example, users can add lines to this +# script to do things like update the drive's firmware or check the drive's +# health. 
The script is optional and can be removed if it is not needed. +# +# See the zfs_prepare_disk(8) man page for details. +# +# Example: +# +# echo "Prepare disk $VDEV_PATH ($VDEV_UPATH) for $VDEV_PREPARE in $POOL_NAME" +# + +exit 0 diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 2c8d5cb0ec..8bc55a1b4b 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -82,6 +82,13 @@ tests = ['zpool_expand_001_pos', 'zpool_expand_002_pos', 'zpool_expand_003_neg', 'zpool_expand_004_pos', 'zpool_expand_005_pos'] tags = ['functional', 'cli_root', 'zpool_expand'] +[tests/functional/cli_root/zpool_import:Linux] +tests = ['zpool_import_hostid_changed', + 'zpool_import_hostid_changed_unclean_export', + 'zpool_import_hostid_changed_cachefile', + 'zpool_import_hostid_changed_cachefile_unclean_export'] +tags = ['functional', 'cli_root', 'zpool_import'] + [tests/functional/cli_root/zpool_reopen:Linux] tests = ['zpool_reopen_001_pos', 'zpool_reopen_002_pos', 'zpool_reopen_003_pos', 'zpool_reopen_004_pos', 'zpool_reopen_005_pos', @@ -115,10 +122,10 @@ tags = ['functional', 'fallocate'] [tests/functional/fault:Linux] tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos', - 'auto_replace_001_pos', 'auto_spare_001_pos', 'auto_spare_002_pos', - 'auto_spare_multiple', 'auto_spare_ashift', 'auto_spare_shared', - 'decrypt_fault', 'decompress_fault', 'scrub_after_resilver', - 'zpool_status_-s'] + 'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos', + 'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift', + 'auto_spare_shared', 'decrypt_fault', 'decompress_fault', + 'scrub_after_resilver', 'zpool_status_-s'] tags = ['functional', 'fault'] [tests/functional/features/large_dnode:Linux] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 558e4b5727..4608e87522 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -262,7 
+262,6 @@ if sys.platform.startswith('freebsd'): 'cli_root/zfs_inherit/zfs_inherit_001_neg': ['FAIL', known_reason], 'cli_root/zpool_import/zpool_import_012_pos': ['FAIL', known_reason], 'delegate/zfs_allow_003_pos': ['FAIL', known_reason], - 'delegate/zfs_allow_010_pos': ['FAIL', known_reason], 'inheritance/inherit_001_pos': ['FAIL', 11829], 'resilver/resilver_restart_001': ['FAIL', known_reason], 'pool_checkpoint/checkpoint_big_rewind': ['FAIL', 12622], @@ -329,6 +328,7 @@ if os.environ.get('CI') == 'true': 'fault/auto_online_001_pos': ['SKIP', ci_reason], 'fault/auto_online_002_pos': ['SKIP', ci_reason], 'fault/auto_replace_001_pos': ['SKIP', ci_reason], + 'fault/auto_replace_002_pos': ['SKIP', ci_reason], 'fault/auto_spare_ashift': ['SKIP', ci_reason], 'fault/auto_spare_shared': ['SKIP', ci_reason], 'procfs/pool_state': ['SKIP', ci_reason], diff --git a/tests/zfs-tests/cmd/dosmode_readonly_write.c b/tests/zfs-tests/cmd/dosmode_readonly_write.c index 0441d1c7b4..b45602d806 100644 --- a/tests/zfs-tests/cmd/dosmode_readonly_write.c +++ b/tests/zfs-tests/cmd/dosmode_readonly_write.c @@ -1,5 +1,5 @@ /* - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 iXsystems, Inc. 
* diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index fa545e06bb..648f2203df 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -130,12 +130,14 @@ export SYSTEM_FILES_LINUX='attr chattr exportfs fallocate + flock free getfattr groupadd groupdel groupmod hostid + logger losetup lsattr lsblk @@ -145,21 +147,20 @@ export SYSTEM_FILES_LINUX='attr md5sum mkswap modprobe + mountpoint mpstat nsenter parted perf setfattr + setpriv sha256sum udevadm unshare useradd userdel usermod - setpriv - mountpoint - flock - logger' + wipefs' export ZFS_FILES='zdb zfs diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 844caa17d8..b4d2b91dd4 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -37,6 +37,12 @@ . ${STF_SUITE}/include/math.shlib . ${STF_SUITE}/include/blkdev.shlib +# On AlmaLinux 9 we will see $PWD = '.' instead of the full path. This causes +# some tests to fail. Fix it up here. +if [ "$PWD" = "." ] ; then + PWD="$(readlink -f $PWD)" +fi + # # Apply constrained path when available. This is required since the # PATH may have been modified by sudo's secure_path behavior. @@ -3334,6 +3340,21 @@ function set_tunable_impl esac } +function save_tunable +{ + [[ ! -d $TEST_BASE_DIR ]] && return 1 + [[ -e $TEST_BASE_DIR/tunable-$1 ]] && return 2 + echo "$(get_tunable """$1""")" > "$TEST_BASE_DIR"/tunable-"$1" +} + +function restore_tunable +{ + [[ ! 
-e $TEST_BASE_DIR/tunable-$1 ]] && return 1 + val="$(cat $TEST_BASE_DIR/tunable-"""$1""")" + set_tunable64 "$1" "$val" + rm $TEST_BASE_DIR/tunable-$1 +} + # # Get a global system tunable # diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 80e7bcb3bd..a0edad14d0 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -90,6 +90,7 @@ VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq +BCLONE_ENABLED zfs_bclone_enabled zfs_bclone_enabled XATTR_COMPAT xattr_compat zfs_xattr_compat ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 3272a5d581..87b50f59ca 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1104,6 +1104,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_import/zpool_import_features_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_features_002_neg.ksh \ functional/cli_root/zpool_import/zpool_import_features_003_pos.ksh \ + functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh \ + functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh \ + functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh \ + functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh \ functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh \ functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh \ @@ -1427,6 +1431,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/fault/auto_online_001_pos.ksh \ functional/fault/auto_online_002_pos.ksh \ functional/fault/auto_replace_001_pos.ksh \ + 
functional/fault/auto_replace_002_pos.ksh \ functional/fault/auto_spare_001_pos.ksh \ functional/fault/auto_spare_002_pos.ksh \ functional/fault/auto_spare_ashift.ksh \ diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh index 2cd2f4763a..e52b34ec8a 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh @@ -52,6 +52,8 @@ log_must set_tunable64 TXG_TIMEOUT 5000 log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS +log_must sync_pool $TESTPOOL true + log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4 log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288 diff --git a/tests/zfs-tests/tests/functional/block_cloning/cleanup.ksh b/tests/zfs-tests/tests/functional/block_cloning/cleanup.ksh index 7ac13adb63..b985445a5d 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/cleanup.ksh @@ -31,4 +31,8 @@ verify_runnable "global" default_cleanup_noexit +if tunable_exists BCLONE_ENABLED ; then + log_must restore_tunable BCLONE_ENABLED +fi + log_pass diff --git a/tests/zfs-tests/tests/functional/block_cloning/setup.ksh b/tests/zfs-tests/tests/functional/block_cloning/setup.ksh index 512f5a0644..58441bf8f3 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/setup.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/setup.ksh @@ -33,4 +33,9 @@ fi verify_runnable "global" +if tunable_exists BCLONE_ENABLED ; then + log_must save_tunable BCLONE_ENABLED + log_must set_tunable32 BCLONE_ENABLED 1 +fi + log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh 
b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh index c226f56e3d..d779689f83 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh @@ -98,11 +98,26 @@ function test_share # filesystem zfs set sharenfs=on $filesystem || \ sub_fail "zfs set sharenfs=on $filesystem failed." - is_shared $mntp || \ - sub_fail "File system $filesystem is not shared (set sharenfs)." # - # Verify 'zfs share' works as well. + # Verify 'zfs share' results in a shared mount. We check this + # multiple times because of Fedora 37+ it's been observed in + # the CI that the share may not be immediately reported. + # + for retry in $(seq 1 10); do + is_shared $mntp && break + + log_note "Wait $retry / 10 for is_shared $mntp (set sharenfs)" + + if [[ $retry -eq 10 ]]; then + sub_fail "File system $filesystem is not shared (set sharenfs)." + fi + + sleep 1 + done + + # + # Verify 'zfs unshare' works as well. # zfs unshare $filesystem || \ sub_fail "zfs unshare $filesystem failed." @@ -112,9 +127,23 @@ function test_share # filesystem zfs share $filesystem || \ sub_fail "zfs share $filesystem failed." - is_shared $mntp || \ - sub_fail "file system $filesystem is not shared (zfs share)." + # + # Verify 'zfs share' results in a shared mount. We check this + # multiple times because of Fedora 37+ it's been observed in + # the CI that the share may not be immediately reported. + # + for retry in $(seq 1 10); do + is_shared $mntp && break + + log_note "Wait $retry / 10 for is_shared $mntp (zfs share)" + + if [[ $retry -eq 10 ]]; then + sub_fail "File system $filesystem is not shared (zfs share)." + fi + + sleep 1 + done #log_note "Sharing a shared file system fails." 
zfs share $filesystem && \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh index c35ca8e8c9..c7c133a219 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh @@ -34,6 +34,7 @@ # STRATEGY: # 1. Create a pool with a known feature set. # 2. Verify only those features are active/enabled. +# 3. Do this for all known feature sets # verify_runnable "global" @@ -47,8 +48,11 @@ log_onexit cleanup log_assert "creates a pool with a specified feature set enabled" -log_must zpool create -f -o compatibility=compat-2020 $TESTPOOL $DISKS -check_feature_set $TESTPOOL compat-2020 -log_must zpool destroy -f $TESTPOOL +for compat in "$ZPOOL_COMPAT_DIR"/* +do + log_must zpool create -f -o compatibility="${compat##*/}" $TESTPOOL $DISKS + check_feature_set $TESTPOOL "${compat##*/}" + log_must zpool destroy -f $TESTPOOL +done log_pass "creates a pool with a specified feature set enabled" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg index 4a9fb5e748..cf9c6a8499 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg @@ -26,6 +26,7 @@ # # Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. # . 
$STF_SUITE/include/libtest.shlib @@ -63,3 +64,7 @@ export VDEV4=$DEVICE_DIR/${DEVICE_FILE}4 export VDEV5=$DEVICE_DIR/${DEVICE_FILE}5 export ALTER_ROOT=/alter_import-test + +export HOSTID_FILE="/etc/hostid" +export HOSTID1=01234567 +export HOSTID2=89abcdef diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib index 559810ff0e..50157fa805 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib @@ -11,6 +11,7 @@ # # Copyright (c) 2016 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. # . $STF_SUITE/include/libtest.shlib diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh new file mode 100755 index 0000000000..bc82b7cc1e --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh @@ -0,0 +1,59 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool that was cleanly exported should be importable without force even if +# the local hostid doesn't match the on-disk hostid. +# +# STRATEGY: +# 1. Set a hostid. +# 2. Create a pool. +# 3. Export the pool. +# 4. Change the hostid. +# 5. 
Verify that importing the pool without force succeeds. +# + +verify_runnable "global" + +function custom_cleanup +{ + rm -f $HOSTID_FILE + cleanup +} + +log_onexit custom_cleanup + +# 1. Set a hostid. +log_must zgenhostid -f $HOSTID1 + +# 2. Create a pool. +log_must zpool create $TESTPOOL1 $VDEV0 + +# 3. Export the pool. +log_must zpool export $TESTPOOL1 + +# 4. Change the hostid. +log_must zgenhostid -f $HOSTID2 + +# 5. Verify that importing the pool without force succeeds. +log_must zpool import -d $DEVICE_DIR $TESTPOOL1 + +log_pass "zpool import can import cleanly exported pool when hostid changes." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh new file mode 100755 index 0000000000..07c43482d6 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh @@ -0,0 +1,65 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool that was cleanly exported should be importable from a cachefile +# without force even if the local hostid doesn't match the on-disk hostid. +# +# STRATEGY: +# 1. Set a hostid. +# 2. Create a pool with a cachefile. +# 3. Backup the cachefile. +# 4. Export the pool. +# 5. Change the hostid. +# 6. 
Verify that importing the pool from the cachefile succeeds +# without force. +# + +verify_runnable "global" + +function custom_cleanup +{ + rm -f $HOSTID_FILE $CPATH $CPATHBKP + cleanup +} + +log_onexit custom_cleanup + +# 1. Set a hostid. +log_must zgenhostid -f $HOSTID1 + +# 2. Create a pool. +log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $VDEV0 + +# 3. Backup the cachefile. +log_must cp $CPATH $CPATHBKP + +# 4. Export the pool. +log_must zpool export $TESTPOOL1 + +# 5. Change the hostid. +log_must zgenhostid -f $HOSTID2 + +# 6. Verify that importing the pool from the cachefile succeeds without force. +log_must zpool import -c $CPATHBKP $TESTPOOL1 + +log_pass "zpool import can import cleanly exported pool from cachefile " \ + "when hostid changes." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh new file mode 100755 index 0000000000..dcb1ac1ab6 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh @@ -0,0 +1,75 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool that wasn't cleanly exported should not be importable from a cachefile +# without force if the local hostid doesn't match the on-disk hostid. 
+# +# STRATEGY: +# 1. Set a hostid. +# 2. Create a pool. +# 3. Backup the cachefile. +# 4. Simulate the pool being torn down without export: +# 4.1. Copy the underlying device state. +# 4.2. Export the pool. +# 4.3. Restore the device state from the copy. +# 5. Change the hostid. +# 6. Verify that importing the pool from the cachefile fails. +# 7. Verify that importing the pool from the cachefile with force +# succeeds. +# + +verify_runnable "global" + +function custom_cleanup +{ + rm -f $HOSTID_FILE $CPATH $CPATHBKP $VDEV0.bak + cleanup +} + +log_onexit custom_cleanup + +# 1. Set a hostid. +log_must zgenhostid -f $HOSTID1 + +# 2. Create a pool. +log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $VDEV0 + +# 3. Backup the cachefile. +log_must cp $CPATH $CPATHBKP + +# 4. Simulate the pool being torn down without export. +log_must cp $VDEV0 $VDEV0.bak +log_must zpool export $TESTPOOL1 +log_must cp -f $VDEV0.bak $VDEV0 +log_must rm -f $VDEV0.bak + +# 5. Change the hostid. +log_must zgenhostid -f $HOSTID2 + +# 6. Verify that importing the pool from the cachefile fails. +log_mustnot zpool import -c $CPATHBKP $TESTPOOL1 + +# 7. Verify that importing the pool from the cachefile with force succeeds. +log_must zpool import -f -c $CPATHBKP $TESTPOOL1 + +log_pass "zpool import from cachefile requires force if not cleanly " \ + "exported and hostid changes." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh new file mode 100755 index 0000000000..ad8cca642d --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh @@ -0,0 +1,70 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. 
+# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool that wasn't cleanly exported should not be importable without force if +# the local hostid doesn't match the on-disk hostid. +# +# STRATEGY: +# 1. Set a hostid. +# 2. Create a pool. +# 3. Simulate the pool being torn down without export: +# 3.1. Copy the underlying device state. +# 3.2. Export the pool. +# 3.3. Restore the device state from the copy. +# 4. Change the hostid. +# 5. Verify that importing the pool fails. +# 6. Verify that importing the pool with force succeeds. +# + +verify_runnable "global" + +function custom_cleanup +{ + rm -f $HOSTID_FILE $VDEV0.bak + cleanup +} + +log_onexit custom_cleanup + +# 1. Set a hostid. +log_must zgenhostid -f $HOSTID1 + +# 2. Create a pool. +log_must zpool create $TESTPOOL1 $VDEV0 + +# 3. Simulate the pool being torn down without export. +log_must cp $VDEV0 $VDEV0.bak +log_must zpool export $TESTPOOL1 +log_must cp -f $VDEV0.bak $VDEV0 +log_must rm -f $VDEV0.bak + +# 4. Change the hostid. +log_must zgenhostid -f $HOSTID2 + +# 5. Verify that importing the pool fails. +log_mustnot zpool import -d $DEVICE_DIR $TESTPOOL1 + +# 6. Verify that importing the pool with force succeeds. +log_must zpool import -d $DEVICE_DIR -f $TESTPOOL1 + +log_pass "zpool import requires force if not cleanly exported " \ + "and hostid changed." 
diff --git a/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg b/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg index e98b5e8b22..9c76a8780b 100644 --- a/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg +++ b/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg @@ -29,7 +29,7 @@ # if is_linux; then - # these are the set of setable ZFS properties + # these are the set of settable ZFS properties PROP_NAMES="\ acltype atime \ checksum compression devices \ @@ -81,7 +81,7 @@ elif is_freebsd; then hidden" else - # these are the set of setable ZFS properties + # these are the set of settable ZFS properties PROP_NAMES="\ aclinherit aclmode atime \ checksum compression devices \ diff --git a/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib b/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib index 3f2f089e81..5ddb6ca2dd 100644 --- a/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib +++ b/tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib @@ -861,7 +861,7 @@ function verify_fs_mount fi if ! ismounted $fs ; then - log_must zfs set mountpoint=$newmntpt $fs + log_must zfs set -u mountpoint=$newmntpt $fs log_must rm -rf $newmntpt log_must mkdir $newmntpt @@ -878,7 +878,7 @@ function verify_fs_mount fi log_must zfs umount $fs log_must rm -rf $newmntpt - log_must zfs set mountpoint=$mntpt $fs + log_must zfs set -u mountpoint=$mntpt $fs fi return 0 diff --git a/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh index d8a8b9ac15..b6b9f789ed 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh @@ -34,13 +34,14 @@ # 1. Update /etc/zfs/vdev_id.conf with scsidebug alias for a persistent path. # This creates keys ID_VDEV and ID_VDEV_PATH and set phys_path="scsidebug". # 2. 
Create a pool and set autoreplace=on (auto-replace is opt-in) -# 3. Export a pool +# 3. Export the pool # 4. Wipe and offline the scsi_debug disk -# 5. Import pool with missing disk +# 5. Import the pool with missing disk # 6. Re-online the wiped scsi_debug disk -# 7. Verify the ZED detects the new unused disk and adds it back to the pool +# 7. Verify ZED detects the new blank disk and replaces the missing vdev +# 8. Verify that the scsi_debug disk was re-partitioned # -# Creates a raidz1 zpool using persistent disk path names +# Creates a raidz1 zpool using persistent /dev/disk/by-vdev path names # (ie not /dev/sdc) # # Auto-replace is opt in, and matches by phys_path. @@ -85,11 +86,27 @@ log_must zpool create -f $TESTPOOL raidz1 $SD_DEVICE $DISK1 $DISK2 $DISK3 log_must zpool set autoreplace=on $TESTPOOL # Add some data to the pool -log_must mkfile $FSIZE /$TESTPOOL/data +log_must zfs create $TESTPOOL/fs +log_must fill_fs /$TESTPOOL/fs 4 100 4096 512 Z log_must zpool export $TESTPOOL +# Record the partition UUID for later comparison +part_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \ + --value /dev/disk/by-id/$SD_DEVICE_ID) +[[ -z "$part_uuid" ]] || log_note original disk GPT uuid ${part_uuid} + +# # Wipe and offline the disk +# +# Note that it is not enough to zero the disk to expunge the partitions. +# You also need to inform the kernel (e.g., 'hdparm -z' or 'partprobe'). +# +# Using partprobe is overkill and hdparm is not as common as wipefs. So +# we use wipefs which lets the kernel know the partition was removed +# from the device (i.e., calls BLKRRPART ioctl). 
+# log_must dd if=/dev/zero of=/dev/disk/by-id/$SD_DEVICE_ID bs=1M count=$SDSIZE +log_must /usr/sbin/wipefs -a /dev/disk/by-id/$SD_DEVICE_ID remove_disk $SD block_device_wait @@ -108,4 +125,18 @@ log_must wait_replacing $TESTPOOL 60 # Validate auto-replace was successful log_must check_state $TESTPOOL "" "ONLINE" +# +# Confirm the partition UUID changed so we know the new disk was relabeled +# +# Note: some older versions of udevadm don't support "--property" option so +# we'll skip this test when it is not supported +# +if [ ! -z "$part_uuid" ]; then + new_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \ + --value /dev/disk/by-id/$SD_DEVICE_ID) + log_note new disk GPT uuid ${new_uuid} + [[ "$part_uuid" = "$new_uuid" ]] && \ + log_fail "The new disk was not relabeled as expected" +fi + log_pass "Auto-replace test successful" diff --git a/tests/zfs-tests/tests/functional/fault/auto_replace_002_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_replace_002_pos.ksh new file mode 100755 index 0000000000..2259e60431 --- /dev/null +++ b/tests/zfs-tests/tests/functional/fault/auto_replace_002_pos.ksh @@ -0,0 +1,192 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2017 by Intel Corporation. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/fault/fault.cfg + +# +# DESCRIPTION: +# Testing Fault Management Agent ZED Logic - Automated Auto-Replace Test. +# Verifies that auto-replace works with by-id paths. +# +# STRATEGY: +# 1. Update /etc/zfs/vdev_id.conf with scsidebug alias for a persistent path. +# This creates keys ID_VDEV and ID_VDEV_PATH and set phys_path="scsidebug". +# 2. Create a pool and set autoreplace=on (auto-replace is opt-in) +# 3. Export the pool +# 4. Wipe and offline the scsi_debug disk +# 5. Import the pool with missing disk +# 6. Re-online the wiped scsi_debug disk with a new serial number +# 7. Verify ZED detects the new blank disk and replaces the missing vdev +# 8. Verify that the scsi_debug disk was re-partitioned +# +# Creates a raidz1 zpool using persistent /dev/disk/by-id path names +# +# Auto-replace is opt in, and matches by phys_path. +# + +verify_runnable "both" + +if ! is_physical_device $DISKS; then + log_unsupported "Unsupported disks for this test." +fi + +function cleanup +{ + zpool status $TESTPOOL + destroy_pool $TESTPOOL + sed -i '/alias scsidebug/d' $VDEVID_CONF + unload_scsi_debug +} + +# +# Wait until a vdev transitions to its replacement vdev +# +# Return 0 when vdev reaches expected state, 1 on timeout. 
+# +# Note: index +2 is to skip over root and raidz-0 vdevs +# +function wait_vdev_online # pool index oldguid timeout +{ + typeset pool=$1 + typeset -i index=$2+2 + typeset guid=$3 + typeset timeout=${4:-60} + typeset -i i=0 + + while [[ $i -lt $timeout ]]; do + vdev_guids=( $(zpool get -H -o value guid $pool all-vdevs) ) + + if [ "${vdev_guids[$index]}" != "${guid}" ]; then + log_note "new vdev[$((index-2))]: ${vdev_guids[$index]}, replacing ${guid}" + return 0 + fi + + i=$((i+1)) + sleep 1 + done + + return 1 +} +log_assert "automated auto-replace with by-id paths" +log_onexit cleanup + +load_scsi_debug $SDSIZE $SDHOSTS $SDTGTS $SDLUNS '512b' +SD=$(get_debug_device) +SD_DEVICE_ID=$(get_persistent_disk_name $SD) +SD_HOST=$(get_scsi_host $SD) + +# Register vdev_id alias for scsi_debug device to create a persistent path +echo "alias scsidebug /dev/disk/by-id/$SD_DEVICE_ID" >>$VDEVID_CONF +block_device_wait + +SD_DEVICE=$(udevadm info -q all -n $DEV_DSKDIR/$SD | \ + awk -F'=' '/ID_VDEV=/ {print $2; exit}') +[ -z $SD_DEVICE ] && log_fail "vdev rule was not registered properly" + +log_must zpool events -c +log_must zpool create -f $TESTPOOL raidz1 $SD_DEVICE_ID $DISK1 $DISK2 $DISK3 + +vdev_guid=$(zpool get guid -H -o value $TESTPOOL $SD_DEVICE_ID) +log_note original vdev guid ${vdev_guid} + +# Auto-replace is opt-in so need to set property +log_must zpool set autoreplace=on $TESTPOOL + +# Add some data to the pool +log_must zfs create $TESTPOOL/fs +log_must fill_fs /$TESTPOOL/fs 4 100 4096 512 Z +log_must zpool export $TESTPOOL + +# Record the partition UUID for later comparison +part_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \ + --value /dev/disk/by-id/$SD_DEVICE_ID) +[[ -z "$part_uuid" ]] || log_note original disk GPT uuid ${part_uuid} + +# +# Wipe and offline the disk +# +# Note that it is not enough to zero the disk to expunge the partitions. +# You also need to inform the kernel (e.g., 'hdparm -z' or 'partprobe'). 
+# +# Using partprobe is overkill and hdparm is not as common as wipefs. So +# we use wipefs which lets the kernel know the partition was removed +# from the device (i.e., calls BLKRRPART ioctl). +# +log_must dd if=/dev/zero of=/dev/disk/by-id/$SD_DEVICE_ID bs=1M count=$SDSIZE +log_must /usr/sbin/wipefs -a /dev/disk/by-id/$SD_DEVICE_ID +remove_disk $SD +block_device_wait + +# Re-import pool with drive missing +log_must zpool import $TESTPOOL +log_must check_state $TESTPOOL "" "DEGRADED" +block_device_wait + +# +# Online an empty disk in the same physical location, with a different by-id +# symlink. We use vpd_use_hostno to make sure the underlying serial number +# changes for the new disk which in turn gives us a different by-id path. +# +# The original names were something like: +# /dev/disk/by-id/scsi-SLinux_scsi_debug_16000-part1 +# /dev/disk/by-id/wwn-0x33333330000007d0-part1 +# +# This new inserted disk, will have different links like: +# /dev/disk/by-id/scsi-SLinux_scsi_debug_2000-part1 +# /dev/disk/by-id/wwn-0x3333333000003e80-part1 +# +echo '0' > /sys/bus/pseudo/drivers/scsi_debug/vpd_use_hostno + +insert_disk $SD $SD_HOST + +# make sure the physical path points to the same scsi-debug device +SD_DEVICE_ID=$(get_persistent_disk_name $SD) +echo "alias scsidebug /dev/disk/by-id/$SD_DEVICE_ID" >>$VDEVID_CONF +block_device_wait + +# Wait for the new disk to be online and replaced +log_must wait_vdev_online $TESTPOOL 0 $vdev_guid 45 +log_must wait_replacing $TESTPOOL 45 + +# Validate auto-replace was successful +log_must check_state $TESTPOOL "" "ONLINE" + +# +# Confirm the partition UUID changed so we know the new disk was relabeled +# +# Note: some older versions of udevadm don't support "--property" option so +# we'll skip this test when it is not supported +# +if [ ! 
-z "$part_uuid" ]; then + new_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \ + --value /dev/disk/by-id/$SD_DEVICE_ID) + log_note new disk GPT uuid ${new_uuid} + [[ "$part_uuid" = "$new_uuid" ]] && \ + log_fail "The new disk was not relabeled as expected" +fi + +log_pass "automated auto-replace with by-id paths" diff --git a/tests/zfs-tests/tests/functional/procfs/pool_state.ksh b/tests/zfs-tests/tests/functional/procfs/pool_state.ksh index 7a02eb68ab..bae8763791 100755 --- a/tests/zfs-tests/tests/functional/procfs/pool_state.ksh +++ b/tests/zfs-tests/tests/functional/procfs/pool_state.ksh @@ -141,7 +141,11 @@ remove_disk $SDISK # background since the command will hang when the pool gets suspended. The # command will resume and exit after we restore the missing disk later on. zpool scrub $TESTPOOL2 & -sleep 3 # Give the scrub some time to run before we check if it fails +# Once we trigger the zpool scrub, all zpool/zfs command gets stuck for 180 seconds. +# Post 180 seconds zpool/zfs commands gets start executing however few more seconds(10s) +# it take to update the status. +# hence sleeping for 200 seconds so that we get the correct status. +sleep 200 # Give the scrub some time to run before we check if it fails log_must check_all $TESTPOOL2 "SUSPENDED"