From 13312e2fa107b2a10595d67de82ecdf74db317ed Mon Sep 17 00:00:00 2001 From: Prakash Surya Date: Mon, 13 Feb 2023 16:35:59 -0800 Subject: [PATCH] Reduce need for contiguous memory for ioctls We've had cases where we trigger an OOM despite having memory freely available on the system. For example, here, we had about 21GB free: kernel: Node 0 Normal: 2418758*4kB (UME) 1549533*8kB (UE) 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 22071296kB The problem being, all the memory is in 4K and 8K contiguous regions, but the allocation request was for a 16K contiguous region: kernel: SafeExecutors-4 invoked oom-killer: gfp_mask=0x42dc0(GFP_KERNEL|__GFP_NOWARN|__GFP_COMP|__GFP_ZERO), order=2, oom_score_adj=0 The offending allocation came from this call trace: kernel: Call Trace: kernel: dump_stack+0x57/0x7a kernel: dump_header+0x4f/0x1e1 kernel: oom_kill_process.cold.33+0xb/0x10 kernel: out_of_memory+0x1ad/0x490 kernel: __alloc_pages_slowpath+0xd55/0xe40 kernel: __alloc_pages_nodemask+0x2df/0x330 kernel: kmalloc_large_node+0x42/0x90 kernel: __kmalloc_node+0x25a/0x320 kernel: ? spl_kmem_free_impl+0x21/0x30 [spl] kernel: spl_kmem_alloc_impl+0xa5/0x100 [spl] kernel: spl_kmem_zalloc+0x19/0x20 [spl] kernel: zfsdev_ioctl+0x2b/0xe0 [zfs] kernel: do_vfs_ioctl+0xa9/0x640 kernel: ? __audit_syscall_entry+0xdd/0x130 kernel: ksys_ioctl+0x67/0x90 kernel: __x64_sys_ioctl+0x1a/0x20 kernel: do_syscall_64+0x5e/0x200 kernel: entry_SYSCALL_64_after_hwframe+0x44/0xa9 kernel: RIP: 0033:0x7fdca3674317 The problem is, for each ioctl that ZFS makes, it has to allocate a zfs_cmd_t structure, which is 13744 bytes in size (on my system): sdb> sizeof zfs_cmd (size_t)13744 This size, coupled with the fact that we currently allocate it with kmem_zalloc, means we need a 16K contiguous region of memory to satisfy the request. The solution taken by this change, is to use "vmem" instead of "kmem" to do the allocation, such that we don't necessarily need a contiguous 16K memory region to satisfy the allocation. Arguably, a better solution would be not to require such a large allocation to begin with (e.g. reduce the size of the zfs_cmd_t structure), but that'd be a much larger change than this "one liner". Thus, I've opted for this approach for now; we can always circle back and attempt to reduce the size of the structure in the future. Reviewed-by: Matthew Ahrens Reviewed-by: Brian Behlendorf Reviewed-by: Richard Yao Reviewed-by: Mark Maybee Reviewed-by: Don Brady Signed-off-by: Prakash Surya Closes #14474 --- module/os/freebsd/zfs/kmod_core.c | 10 +++++----- module/os/linux/zfs/zfs_ioctl_os.c | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/module/os/freebsd/zfs/kmod_core.c b/module/os/freebsd/zfs/kmod_core.c index a1b1c8e4e6..f4c87013db 100644 --- a/module/os/freebsd/zfs/kmod_core.c +++ b/module/os/freebsd/zfs/kmod_core.c @@ -142,7 +142,7 @@ zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag, return (EINVAL); uaddr = (void *)zp->zfs_cmd; - zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); + zc = vmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); #ifdef ZFS_LEGACY_SUPPORT /* * Remap ioctl code for legacy user binaries @@ -150,10 +150,10 @@ zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag, if (zp->zfs_ioctl_version == ZFS_IOCVER_LEGACY) { vecnum = zfs_ioctl_legacy_to_ozfs(vecnum); if (vecnum < 0) { - kmem_free(zc, sizeof (zfs_cmd_t)); + vmem_free(zc, sizeof (zfs_cmd_t)); return (ENOTSUP); } - zcl = kmem_zalloc(sizeof (zfs_cmd_legacy_t), KM_SLEEP); + zcl = vmem_zalloc(sizeof (zfs_cmd_legacy_t), KM_SLEEP); if (copyin(uaddr, zcl, sizeof (zfs_cmd_legacy_t))) { error = SET_ERROR(EFAULT); goto out; @@ -180,9 +180,9 @@ zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag, out: #ifdef ZFS_LEGACY_SUPPORT if (zcl) - kmem_free(zcl, sizeof (zfs_cmd_legacy_t)); + vmem_free(zcl, sizeof (zfs_cmd_legacy_t)); #endif - kmem_free(zc, sizeof (zfs_cmd_t)); + vmem_free(zc, sizeof (zfs_cmd_t)); MPASS(tsd_get(rrw_tsd_key) == NULL); return (error); } diff --git a/module/os/linux/zfs/zfs_ioctl_os.c b/module/os/linux/zfs/zfs_ioctl_os.c index f8d1777b07..f068f544f0 100644 --- a/module/os/linux/zfs/zfs_ioctl_os.c +++ b/module/os/linux/zfs/zfs_ioctl_os.c @@ -135,7 +135,7 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) vecnum = cmd - ZFS_IOC_FIRST; - zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); + zc = vmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); if (ddi_copyin((void *)(uintptr_t)arg, zc, sizeof (zfs_cmd_t), 0)) { error = -SET_ERROR(EFAULT); @@ -146,7 +146,7 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) if (error == 0 && rc != 0) error = -SET_ERROR(EFAULT); out: - kmem_free(zc, sizeof (zfs_cmd_t)); + vmem_free(zc, sizeof (zfs_cmd_t)); return (error); }