zfs/include/linux/vfs_compat.h

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
 */

#ifndef _ZFS_VFS_H
#define _ZFS_VFS_H

/*
 * 2.6.28 API change,
 * Added insert_inode_locked() helper function, prior to this most callers
 * used insert_inode_hash().  The older method doesn't check for collisions
 * in the inode_hashtable but it is still acceptable for use.
 *
 * When HAVE_INSERT_INODE_LOCKED is not defined this fallback simply calls
 * insert_inode_hash() on the inode and always returns 0, so callers that
 * check the return value never see a failure from it.
 */
#ifndef HAVE_INSERT_INODE_LOCKED
static inline int
insert_inode_locked(struct inode *ip)
{
	insert_inode_hash(ip);
	return (0);
}
#endif /* HAVE_INSERT_INODE_LOCKED */

/*
 * 2.6.35 API change,
 * Add truncate_setsize() if it is not exported by the Linux kernel.
 *
 * Truncate the inode and pages associated with the inode. The pages are
 * unmapped and removed from cache.
 */
#ifndef HAVE_TRUNCATE_SETSIZE
static inline void
truncate_setsize(struct inode *ip, loff_t new)
{
	struct address_space *as = ip->i_mapping;
	loff_t hole_start;

	/* Record the new file size in the inode first. */
	i_size_write(ip, new);

	/* Offset handed to both unmap calls: 'new' plus PAGE_SIZE - 1. */
	hole_start = new + PAGE_SIZE - 1;

	/*
	 * The mapping is unmapped, the cached pages from 'new' onward are
	 * truncated, and the mapping is unmapped a second time with the
	 * same arguments.
	 * NOTE(review): the unmap/truncate/unmap ordering appears to be
	 * deliberate (it matches the in-kernel helper); do not reorder.
	 */
	unmap_mapping_range(as, hole_start, 0, 1);
	truncate_inode_pages(as, new);
	unmap_mapping_range(as, hole_start, 0, 1);
}
#endif /* HAVE_TRUNCATE_SETSIZE */

#if defined(HAVE_BDI) && !defined(HAVE_BDI_SETUP_AND_REGISTER)
/*
 * 2.6.34 API change,
 * Add bdi_setup_and_register() function if not yet provided by kernel.
 * It is used to quickly initialize and register a BDI for the filesystem.
 */
extern atomic_long_t zfs_bdi_seq;

static inline int
bdi_setup_and_register(struct backing_dev_info *bdi,char *name,unsigned int cap)
{
	char tmp[32];
	int error;

	bdi->name = name;
	bdi->capabilities = cap;
	error = bdi_init(bdi);
	if (error)
		return (error);

	sprintf(tmp, "%.28s%s", name, "-%d");
	error = bdi_register(bdi, NULL, tmp,
	    atomic_long_inc_return(&zfs_bdi_seq));
	if (error) {
		bdi_destroy(bdi);
		return (error);
	}

	return (error);
}
#endif /* HAVE_BDI && !HAVE_BDI_SETUP_AND_REGISTER */

/*
 * 2.6.38 API change,
 * LOOKUP_RCU flag introduced to distinguish rcu-walk from ref-walk cases.
 *
 * When the kernel headers do not define LOOKUP_RCU it is defined here as
 * 0x0, so a bitwise test of any flags word against it is always false.
 */
#ifndef LOOKUP_RCU
#define LOOKUP_RCU      0x0
#endif /* LOOKUP_RCU */

/*
 * 3.2-rc1 API change,
 * Add set_nlink() if it is not exported by the Linux kernel.
 *
 * i_nlink is read-only in Linux 3.2, but it can be set directly in
 * earlier kernels.  When HAVE_SET_NLINK is not defined this fallback
 * stores 'nlink' straight into inode->i_nlink.
 */
#ifndef HAVE_SET_NLINK
static inline void
set_nlink(struct inode *inode, unsigned int nlink)
{
	inode->i_nlink = nlink;
}
#endif /* HAVE_SET_NLINK */

/*
 * 3.3 API change,
 * The VFS .create, .mkdir and .mknod callbacks were updated to take a
 * umode_t type rather than an int.  To cleanly handle both definitions
 * the zpl_umode_t type is introduced and set accordingly: it aliases
 * umode_t when HAVE_MKDIR_UMODE_T is defined and int otherwise.
 */
#ifdef HAVE_MKDIR_UMODE_T
typedef	umode_t		zpl_umode_t;
#else
typedef	int		zpl_umode_t;
#endif /* HAVE_MKDIR_UMODE_T */

/*
 * 3.5 API change,
 * The clear_inode() function replaces end_writeback() and introduces an
 * ordering change regarding when the inode_sync_wait() occurs.  See the
 * configure check in config/kernel-clear-inode.m4 for full details.
 *
 * When HAVE_EVICT_INODE is defined but HAVE_CLEAR_INODE is not,
 * clear_inode(ip) is mapped onto end_writeback(ip) so callers can use
 * the clear_inode() name in both cases.
 */
#if defined(HAVE_EVICT_INODE) && !defined(HAVE_CLEAR_INODE)
#define clear_inode(ip)		end_writeback(ip)
#endif /* HAVE_EVICT_INODE && !HAVE_CLEAR_INODE */

/*
 * 3.6 API change,
 * The sget() helper function now takes the mount flags as an argument.
 *
 * zpl_sget() always accepts the flags argument 'fl'.  It is forwarded to
 * sget() only when HAVE_5ARG_SGET is defined; otherwise 'fl' is dropped
 * and the remaining four arguments are passed through unchanged.
 */
#ifdef HAVE_5ARG_SGET
#define zpl_sget(type, cmp, set, fl, mtd)	sget(type, cmp, set, fl, mtd)
#else
#define zpl_sget(type, cmp, set, fl, mtd)	sget(type, cmp, set, mtd)
#endif /* HAVE_5ARG_SGET */

#endif /* _ZFS_VFS_H */
Linux 2.6.35 compat, fops->fsync() The fsync() callback in the file_operations structure used to take 3 arguments. The callback now only takes 2 arguments because the dentry argument was determined to be unused by all consumers. To handle this a compatibility prototype was added to ensure the right prototype is used. Our implementation never used the dentry argument either so it's just a matter of using the right prototype. 2011-02-11 16:58:55 +00:00			`/*`
			`* CDDL HEADER START`
			`*`
			`* The contents of this file are subject to the terms of the`
			`* Common Development and Distribution License (the "License").`
			`* You may not use this file except in compliance with the License.`
			`*`
			`* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE`
			`* or http://www.opensolaris.org/os/licensing.`
			`* See the License for the specific language governing permissions`
			`* and limitations under the License.`
			`*`
			`* When distributing Covered Code, include this CDDL HEADER in each`
			`* file and include the License file at usr/src/OPENSOLARIS.LICENSE.`
			`* If applicable, add the following below this CDDL HEADER, with the`
			`* fields enclosed by brackets "[]" replaced with your own identifying`
			`* information: Portions Copyright [yyyy] [name of copyright owner]`
			`*`
			`* CDDL HEADER END`
			`*/`

			`/*`
			`* Copyright (C) 2011 Lawrence Livermore National Security, LLC.`
			`*/`

			`#ifndef _ZFS_VFS_H`
			`#define _ZFS_VFS_H`

Linux 2.6.28 compat, insert_inode_locked() Added insert_inode_locked() helper function, prior to this most callers used insert_inode_hash(). The older method doesn't check for collisions in the inode_hashtable but it is still acceptable for use. Fall back to using insert_inode_hash() when insert_inode_locked() is unavailable. 2011-03-22 16:55:09 +00:00			`/*`
			`* 2.6.28 API change,`
			`* Added insert_inode_locked() helper function, prior to this most callers`
			`* used insert_inode_hash(). The older method doesn't check for collisions`
			`* in the inode_hashtable but it is still acceptable for use.`
			`*/`
			`#ifndef HAVE_INSERT_INODE_LOCKED`
			`static inline int`
			`insert_inode_locked(struct inode *ip)`
			`{`
			`insert_inode_hash(ip);`
			`return (0);`
			`}`
			`#endif /* HAVE_INSERT_INODE_LOCKED */`
Linux 2.6.35 compat, fops->fsync() The fsync() callback in the file_operations structure used to take 3 arguments. The callback now only takes 2 arguments because the dentry argument was determined to be unused by all consumers. To handle this a compatibility prototype was added to ensure the right prototype is used. Our implementation never used the dentry argument either so it's just a matter of using the right prototype. 2011-02-11 16:58:55 +00:00
Tear down and flush the mmap region The inode eviction should unmap the pages associated with the inode. These pages should also be flushed to disk to avoid the data loss. Therefore, use truncate_setsize() in evict_inode() to release the pagecache. The API truncate_setsize() was added in 2.6.35 kernel. To ensure compatibility with the old kernel, the patch defines its own truncate_setsize function. Signed-off-by: Prasad Joshi <pjoshi@stec-inc.com> Closes #255 2011-06-25 12:30:29 +00:00			`/*`
			`* 2.6.35 API change,`
			`* Add truncate_setsize() if it is not exported by the Linux kernel.`
			`*`
			`* Truncate the inode and pages associated with the inode. The pages are`
			`* unmapped and removed from cache.`
			`*/`
			`#ifndef HAVE_TRUNCATE_SETSIZE`
			`static inline void`
			`truncate_setsize(struct inode *ip, loff_t new)`
			`{`
			`struct address_space *mapping = ip->i_mapping;`

			`i_size_write(ip, new);`

			`unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);`
			`truncate_inode_pages(mapping, new);`
			`unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);`
			`}`
			`#endif /* HAVE_TRUNCATE_SETSIZE */`

Simplify BDI integration Update the code to use the bdi_setup_and_register() helper to simplify the bdi integration code. The updated code now just registers the bdi during mount and destroys it during unmount. The only complication is that for 2.6.32 - 2.6.33 kernels the helper wasn't available so in these cases the zfs code must provide it. Luckily the bdi_setup_and_register() function is trivial. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #367 2011-11-08 00:39:03 +00:00			`#if defined(HAVE_BDI) && !defined(HAVE_BDI_SETUP_AND_REGISTER)`
Add backing_device_info per-filesystem For a long time now the kernel has been moving away from using the pdflush daemon to write 'old' dirty pages to disk. The primary reason for this is because the pdflush daemon is single threaded and can be a limiting factor for performance. Since pdflush sequentially walks the dirty inode list for each super block any delay in processing can slow down dirty page writeback for all filesystems. The replacement for pdflush is called bdi (backing device info). The bdi system involves creating a per-filesystem control structure each with its own private sets of queues to manage writeback. The advantage is greater parallelism which improves performance and prevents a single filesystem from slowing writeback to the others. For a long time both systems co-existed in the kernel so it wasn't strictly required to implement the bdi scheme. However, as of Linux 2.6.36 kernels the pdflush functionality has been retired. Since ZFS already bypasses the page cache for most I/O this is only an issue for mmap(2) writes which must go through the page cache. Even then adding this missing support for newer kernels was overlooked because there are other mechanisms which can trigger writeback. However, there is one critical case where not implementing the bdi functionality can cause problems. If an application handles a page fault it can enter the balance_dirty_pages() callpath. This will result in the application hanging until the number of dirty pages in the system drops below the dirty ratio. Without a registered backing_device_info for the filesystem the dirty pages will not get written out. Thus the application will hang. As mentioned above this was less of an issue with older kernels because pdflush would eventually write out the dirty pages. This change adds a backing_device_info structure to the zfs_sb_t which is already allocated per-super block. It is then registered when the filesystem mounted and unregistered on unmount. 
It will not be registered for mounted snapshots which are read-only. This change will result in flush-<pool> thread being dynamically created and destroyed per-mounted filesystem for writeback. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #174 2011-08-02 01:24:40 +00:00			`/*`
Simplify BDI integration Update the code to use the bdi_setup_and_register() helper to simplify the bdi integration code. The updated code now just registers the bdi during mount and destroys it during unmount. The only complication is that for 2.6.32 - 2.6.33 kernels the helper wasn't available so in these cases the zfs code must provide it. Luckily the bdi_setup_and_register() function is trivial. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #367 2011-11-08 00:39:03 +00:00			`* 2.6.34 API change,`
			`* Add bdi_setup_and_register() function if not yet provided by kernel.`
			`* It is used to quickly initialize and register a BDI for the filesystem.`
Add backing_device_info per-filesystem For a long time now the kernel has been moving away from using the pdflush daemon to write 'old' dirty pages to disk. The primary reason for this is because the pdflush daemon is single threaded and can be a limiting factor for performance. Since pdflush sequentially walks the dirty inode list for each super block any delay in processing can slow down dirty page writeback for all filesystems. The replacement for pdflush is called bdi (backing device info). The bdi system involves creating a per-filesystem control structure each with its own private sets of queues to manage writeback. The advantage is greater parallelism which improves performance and prevents a single filesystem from slowing writeback to the others. For a long time both systems co-existed in the kernel so it wasn't strictly required to implement the bdi scheme. However, as of Linux 2.6.36 kernels the pdflush functionality has been retired. Since ZFS already bypasses the page cache for most I/O this is only an issue for mmap(2) writes which must go through the page cache. Even then adding this missing support for newer kernels was overlooked because there are other mechanisms which can trigger writeback. However, there is one critical case where not implementing the bdi functionality can cause problems. If an application handles a page fault it can enter the balance_dirty_pages() callpath. This will result in the application hanging until the number of dirty pages in the system drops below the dirty ratio. Without a registered backing_device_info for the filesystem the dirty pages will not get written out. Thus the application will hang. As mentioned above this was less of an issue with older kernels because pdflush would eventually write out the dirty pages. This change adds a backing_device_info structure to the zfs_sb_t which is already allocated per-super block. It is then registered when the filesystem mounted and unregistered on unmount. 
It will not be registered for mounted snapshots which are read-only. This change will result in flush-<pool> thread being dynamically created and destroyed per-mounted filesystem for writeback. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #174 2011-08-02 01:24:40 +00:00			`*/`
Simplify BDI integration Update the code to use the bdi_setup_and_register() helper to simplify the bdi integration code. The updated code now just registers the bdi during mount and destroys it during unmount. The only complication is that for 2.6.32 - 2.6.33 kernels the helper wasn't available so in these cases the zfs code must provide it. Luckily the bdi_setup_and_register() function is trivial. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #367 2011-11-08 00:39:03 +00:00			`extern atomic_long_t zfs_bdi_seq;`

			`static inline int`
			`bdi_setup_and_register(struct backing_dev_info *bdi,char *name,unsigned int cap)`
			`{`
			`char tmp[32];`
			`int error;`

			`bdi->name = name;`
			`bdi->capabilities = cap;`
			`error = bdi_init(bdi);`
			`if (error)`
			`return (error);`

			`sprintf(tmp, "%.28s%s", name, "-%d");`
			`error = bdi_register(bdi, NULL, tmp,`
			`atomic_long_inc_return(&zfs_bdi_seq));`
			`if (error) {`
			`bdi_destroy(bdi);`
			`return (error);`
			`}`

			`return (error);`
			`}`
			`#endif /* HAVE_BDI && !HAVE_BDI_SETUP_AND_REGISTER */`
Add backing_device_info per-filesystem For a long time now the kernel has been moving away from using the pdflush daemon to write 'old' dirty pages to disk. The primary reason for this is because the pdflush daemon is single threaded and can be a limiting factor for performance. Since pdflush sequentially walks the dirty inode list for each super block any delay in processing can slow down dirty page writeback for all filesystems. The replacement for pdflush is called bdi (backing device info). The bdi system involves creating a per-filesystem control structure each with its own private sets of queues to manage writeback. The advantage is greater parallelism which improves performance and prevents a single filesystem from slowing writeback to the others. For a long time both systems co-existed in the kernel so it wasn't strictly required to implement the bdi scheme. However, as of Linux 2.6.36 kernels the pdflush functionality has been retired. Since ZFS already bypasses the page cache for most I/O this is only an issue for mmap(2) writes which must go through the page cache. Even then adding this missing support for newer kernels was overlooked because there are other mechanisms which can trigger writeback. However, there is one critical case where not implementing the bdi functionality can cause problems. If an application handles a page fault it can enter the balance_dirty_pages() callpath. This will result in the application hanging until the number of dirty pages in the system drops below the dirty ratio. Without a registered backing_device_info for the filesystem the dirty pages will not get written out. Thus the application will hang. As mentioned above this was less of an issue with older kernels because pdflush would eventually write out the dirty pages. This change adds a backing_device_info structure to the zfs_sb_t which is already allocated per-super block. It is then registered when the filesystem mounted and unregistered on unmount. 
It will not be registered for mounted snapshots which are read-only. This change will result in flush-<pool> thread being dynamically created and destroyed per-mounted filesystem for writeback. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #174 2011-08-02 01:24:40 +00:00
Fix 'zfs rollback' on mounted file systems Rolling back a mounted filesystem with open file handles and cached dentries+inodes never worked properly in ZoL. The major issue was that Linux provides no easy mechanism for modules to invalidate the inode cache for a file system. Because of this it was possible that an inode from the previous filesystem would not get properly dropped from the cache during rolling back. Then a new inode with the same inode number would be create and collide with the existing cached inode. Ideally this would trigger an VERIFY() but in practice the error wasn't handled and it would just NULL reference. Luckily, this issue can be resolved by sprucing up the existing Solaris zfs_rezget() functionality for the Linux VFS. The way it works now is that when a file system is rolled back all the cached inodes will be traversed and refetched from disk. If a version of the cached inode exists on disk the in-core copy will be updated accordingly. If there is no match for that object on disk it will be unhashed from the inode cache and marked as stale. This will effectively make the inode unfindable for lookups allowing the inode number to be immediately recycled. The inode will then only be accessible from the cached dentries. Subsequent dentry lookups which reference a stale inode will result in the dentry being invalidated. Once invalidated the dentry will drop its reference on the inode allowing it to be safely pruned from the cache. Special care is taken for negative dentries since they do not reference any inode. These dentires will be invalidate based on when they were added to the dentry cache. Entries added before the last rollback will be invalidate to prevent them from masking real files in the dataset. Two nice side effects of this fix are: * Removes the dependency on spl_invalidate_inodes(), it can now be safely removed from the SPL when we choose to do so. * zfs_znode_alloc() no longer requires a dentry to be passed. 
This effectively reverts this portion of the code to its upstream counterpart. The dentry is now instantiated more correctly in the Linux ZPL layer. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Ned Bass <bass6@llnl.gov> Closes #795 2013-01-16 00:41:09 +00:00			`/*`
			`* 2.6.38 API change,`
			`* LOOKUP_RCU flag introduced to distinguish rcu-walk from ref-walk cases.`
			`*/`
			`#ifndef LOOKUP_RCU`
			`#define LOOKUP_RCU 0x0`
			`#endif /* LOOKUP_RCU */`

Linux 3.2 compat: set_nlink() Directly changing inode->i_nlink is deprecated in Linux 3.2 by commit SHA: bfe8684869601dacfcb2cd69ef8cfd9045f62170 Use the new set_nlink() kernel function instead. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes: #462 2011-12-16 21:15:12 +00:00			`/*`
			`* 3.2-rc1 API change,`
			`* Add set_nlink() if it is not exported by the Linux kernel.`
			`*`
			`* i_nlink is read-only in Linux 3.2, but it can be set directly in`
			`* earlier kernels.`
			`*/`
			`#ifndef HAVE_SET_NLINK`
			`static inline void`
			`set_nlink(struct inode *inode, unsigned int nlink)`
			`{`
			`inode->i_nlink = nlink;`
			`}`
			`#endif /* HAVE_SET_NLINK */`

Linux 3.3 compat, iops->create()/mkdir()/mknod() The mode argument of iops->create()/mkdir()/mknod() was changed from an 'int' to a 'umode_t'. To prevent a compiler warning an autoconf check was added to detect the API change and then correctly set a zpl_umode_t typedef. There is no functional change. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #701 2012-04-30 19:01:49 +00:00			`/*`
			`* 3.3 API change,`
			`* The VFS .create, .mkdir and .mknod callbacks were updated to take a`
			`* umode_t type rather than an int. To cleanly handle both definitions`
			`* the zpl_umode_t type is introduced and set accordingly.`
			`*/`
Linux 3.6 compat, iops->mkdir() Use .mkdir instead of .create in 3.3 compatibility check. Linux 3.6 modifies inode_operations->create's function prototype. This causes an autotools Linux 3.3. compatibility check for a function prototype change in create, mkdir and mknode to fail. Since mkdir and mknode are unchanged, we modify the check to examine it instead. Signed-off-by: Richard Yao <ryao@cs.stonybrook.edu> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #873 2012-08-16 23:31:54 +00:00			`#ifdef HAVE_MKDIR_UMODE_T`
Linux 3.3 compat, iops->create()/mkdir()/mknod() The mode argument of iops->create()/mkdir()/mknod() was changed from an 'int' to a 'umode_t'. To prevent a compiler warning an autoconf check was added to detect the API change and then correctly set a zpl_umode_t typedef. There is no functional change. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #701 2012-04-30 19:01:49 +00:00			`typedef umode_t zpl_umode_t;`
			`#else`
			`typedef int zpl_umode_t;`
			`#endif`

Linux 3.5 compat, end_writeback() changed to clear_inode() The end_writeback() function was changed by moving the call to inode_sync_wait() earlier into evict(). This effectively changes the ordering of the sync but it does not impact the details of the zfs implementation. However, as part of this change end_writeback() was renamed to clear_inode() to reflect the new semantics. This change does impact us and clear_inode() now maps to end_writeback() for kernels prior to 3.5. Signed-off-by: Richard Yao <ryao@cs.stonybrook.edu> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #784 2012-07-23 18:39:25 +00:00			`/*`
			`* 3.5 API change,`
			`* The clear_inode() function replaces end_writeback() and introduces an`
			`* ordering change regarding when the inode_sync_wait() occurs. See the`
			`* configure check in config/kernel-clear-inode.m4 for full details.`
			`*/`
			`#if defined(HAVE_EVICT_INODE) && !defined(HAVE_CLEAR_INODE)`
			`#define clear_inode(ip) end_writeback(ip)`
			`#endif /* HAVE_EVICT_INODE && !HAVE_CLEAR_INODE */`

Linux 3.6 compat, sget() As of Linux commit 9249e17fe094d853d1ef7475dd559a2cc7e23d42 the mount flags are now passed to sget() so they can be used when initializing a new superblock. ZFS never uses sget() in this fashion so we can simply pass a zero and add a zpl_sget() compatibility wrapper. Signed-off-by: Yuxuan Shui <yshuiv7@gmail.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #873 2012-10-12 13:40:53 +00:00			`/*`
			`* 3.6 API change,`
			`* The sget() helper function now takes the mount flags as an argument.`
			`*/`
			`#ifdef HAVE_5ARG_SGET`
			`#define zpl_sget(type, cmp, set, fl, mtd) sget(type, cmp, set, fl, mtd)`
			`#else`
			`#define zpl_sget(type, cmp, set, fl, mtd) sget(type, cmp, set, mtd)`
			`#endif /* HAVE_5ARG_SGET */`

Linux 2.6.35 compat, fops->fsync() The fsync() callback in the file_operations structure used to take 3 arguments. The callback now only takes 2 arguments because the dentry argument was determined to be unused by all consumers. To handle this a compatibility prototype was added to ensure the right prototype is used. Our implementation never used the dentry argument either so it's just a matter of using the right prototype. 2011-02-11 16:58:55 +00:00			`#endif /* _ZFS_VFS_H */`