From fac84846a28c0950d4433118b3dffd44306df62d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Nov 2024 10:30:27 -0500 Subject: fanotify: disable readahead if we have pre-content watches With page faults we can trigger readahead on the file, and then subsequent faults can find these pages and insert them into the file without emitting an fanotify event. To avoid this case, disable readahead if we have pre-content watches on the file. This way we are guaranteed to get an event for every range we attempt to access on a pre-content watched file. Reviewed-by: Christian Brauner Signed-off-by: Josef Bacik Signed-off-by: Jan Kara Link: https://patch.msgid.link/70a54e859f555e54bc7a47b32fe5aca92b085615.1731684329.git.josef@toxicpanda.com --- mm/filemap.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 7c76a123ba18..e9a0f330d33e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3150,6 +3150,14 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) unsigned long vm_flags = vmf->vma->vm_flags; unsigned int mmap_miss; + /* + * If we have pre-content watches we need to disable readahead to make + * sure that we don't populate our mapping with 0 filled pages that we + * never emitted an event for. + */ + if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) + return fpin; + #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Use the readahead code, even if readahead is disabled */ if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) { @@ -3218,6 +3226,10 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, struct file *fpin = NULL; unsigned int mmap_miss; + /* See comment in do_sync_mmap_readahead. */ + if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) + return fpin; + /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) return fpin; -- cgit v1.2.3 From 8392bc2ff8c8bf7c4c5e6dfa71ccd893a3c046f6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Nov 2024 10:30:29 -0500 Subject: fsnotify: generate pre-content permission event on page fault FS_PRE_ACCESS will be generated on page fault depending on the faulting method. This pre-content event is meant to be used by hierarchical storage managers that want to fill in the file content on first read access. Export a simple helper that file systems that have their own ->fault() will use, and have a more complicated helper to be do fancy things in filemap_fault. Signed-off-by: Josef Bacik Signed-off-by: Jan Kara Link: https://patch.msgid.link/aa56c50ce81b1fd18d7f5d71dd2dfced5eba9687.1731684329.git.josef@toxicpanda.com --- include/linux/mm.h | 1 + mm/filemap.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/nommu.c | 7 ++++++ 3 files changed, 82 insertions(+) (limited to 'mm/filemap.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index c39c4945946c..e6c3c9cbcfe5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3420,6 +3420,7 @@ extern vm_fault_t filemap_fault(struct vm_fault *vmf); extern vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); +extern vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf); extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ diff --git a/mm/filemap.c b/mm/filemap.c index e9a0f330d33e..6fdd5dc093c0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -3288,6 +3289,48 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf) return ret; } +/** + * filemap_fsnotify_fault - maybe emit a pre-content event. + * @vmf: struct vm_fault containing details of the fault. + * + * If we have a pre-content watch on this file we will emit an event for this + * range. If we return anything the fault caller should return immediately, we + * will return VM_FAULT_RETRY if we had to emit an event, which will trigger the + * fault again and then the fault handler will run the second time through. + * + * Return: a bitwise-OR of %VM_FAULT_ codes, 0 if nothing happened. + */ +vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf) +{ + struct file *fpin = NULL; + int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_ACCESS; + loff_t pos = vmf->pgoff >> PAGE_SHIFT; + size_t count = PAGE_SIZE; + int err; + + /* + * We already did this and now we're retrying with everything locked, + * don't emit the event and continue. + */ + if (vmf->flags & FAULT_FLAG_TRIED) + return 0; + + /* No watches, we're done. */ + if (likely(!FMODE_FSNOTIFY_HSM(vmf->vma->vm_file->f_mode))) + return 0; + + fpin = maybe_unlock_mmap_for_io(vmf, fpin); + if (!fpin) + return VM_FAULT_SIGBUS; + + err = fsnotify_file_area_perm(fpin, mask, &pos, count); + fput(fpin); + if (err) + return VM_FAULT_SIGBUS; + return VM_FAULT_RETRY; +} +EXPORT_SYMBOL_GPL(filemap_fsnotify_fault); + /** * filemap_fault - read in file data for page fault handling * @vmf: struct vm_fault containing details of the fault @@ -3391,6 +3434,37 @@ retry_find: * or because readahead was otherwise unable to retrieve it. */ if (unlikely(!folio_test_uptodate(folio))) { + /* + * If this is a precontent file we have can now emit an event to + * try and populate the folio. + */ + if (!(vmf->flags & FAULT_FLAG_TRIED) && + unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { + loff_t pos = folio_pos(folio); + size_t count = folio_size(folio); + + /* We're NOWAIT, we have to retry. */ + if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) { + folio_unlock(folio); + goto out_retry; + } + + if (mapping_locked) + filemap_invalidate_unlock_shared(mapping); + mapping_locked = false; + + folio_unlock(folio); + fpin = maybe_unlock_mmap_for_io(vmf, fpin); + if (!fpin) + goto out_retry; + + error = fsnotify_file_area_perm(fpin, MAY_ACCESS, &pos, + count); + if (error) + ret = VM_FAULT_SIGBUS; + goto out_retry; + } + /* * If the invalidate lock is not held, the folio was in cache * and uptodate and now it is not. Strange but possible since we diff --git a/mm/nommu.c b/mm/nommu.c index 9cb6e99215e2..baa79abdaf03 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1613,6 +1613,13 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, } EXPORT_SYMBOL(remap_vmalloc_range); +vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf) +{ + BUG(); + return 0; +} +EXPORT_SYMBOL_GPL(filemap_fsnotify_fault); + vm_fault_t filemap_fault(struct vm_fault *vmf) { BUG(); -- cgit v1.2.3 From 62e72d2cf702a5e2fb53d9c46ed900d9384e4a06 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 22 Dec 2024 20:29:36 +0800 Subject: mm, madvise: fix potential workingset node list_lru leaks Since commit 5abc1e37afa0 ("mm: list_lru: allocate list_lru_one only when needed"), all list_lru users need to allocate the items using the new infrastructure that provides list_lru info for slab allocation, ensuring that the corresponding memcg list_lru is allocated before use. For workingset shadow nodes (which are xa_node), users are converted to use the new infrastructure by commit 9bbdc0f32409 ("xarray: use kmem_cache_alloc_lru to allocate xa_node"). The xas->xa_lru will be set correctly for filemap users. However, there is a missing case: xa_node allocations caused by madvise(..., MADV_COLLAPSE). madvise(..., MADV_COLLAPSE) will also read in the absent parts of file map, and there will be xa_nodes allocated for the caller's memcg (assuming it's not rootcg). However, these allocations won't trigger memcg list_lru allocation because the proper xas info was not set. If nothing else has allocated other xa_nodes for that memcg to trigger list_lru creation, and memory pressure starts to evict file pages, workingset_update_node will try to add these xa_nodes to their corresponding memcg list_lru, and it does not exist (NULL). So they will be added to rootcg's list_lru instead. This shouldn't be a significant issue in practice, but it is indeed unexpected behavior, and these xa_nodes will not be reclaimed effectively. And may lead to incorrect counting of the list_lru->nr_items counter. This problem wasn't exposed until recent commit 28e98022b31ef ("mm/list_lru: simplify reparenting and initial allocation") added a sanity check: only dying memcg could have a NULL list_lru when list_lru_{add,del} is called. This problem triggered this WARNING. So make madvise(..., MADV_COLLAPSE) also call xas_set_lru() to pass the list_lru which we may want to insert xa_node into later. And move mapping_set_update to mm/internal.h, and turn into a macro to avoid including extra headers in mm/internal.h. Link: https://lkml.kernel.org/r/20241222122936.67501-1-ryncsn@gmail.com Fixes: 9bbdc0f32409 ("xarray: use kmem_cache_alloc_lru to allocate xa_node") Reported-by: syzbot+38a0cbd267eff2d286ff@syzkaller.appspotmail.com Closes: https://lore.kernel.org/lkml/675d01e9.050a0220.37aaf.00be.GAE@google.com/ Signed-off-by: Kairui Song Cc: Chengming Zhou Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Sasha Levin Cc: Shakeel Butt Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/filemap.c | 9 --------- mm/internal.h | 6 ++++++ mm/khugepaged.c | 3 +++ 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index f61cf51c2238..33b60d448fca 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -124,15 +124,6 @@ * ->private_lock (zap_pte_range->block_dirty_folio) */ -static void mapping_set_update(struct xa_state *xas, - struct address_space *mapping) -{ - if (dax_mapping(mapping) || shmem_mapping(mapping)) - return; - xas_set_update(xas, workingset_update_node); - xas_set_lru(xas, &shadow_nodes); -} - static void page_cache_delete(struct address_space *mapping, struct folio *folio, void *shadow) { diff --git a/mm/internal.h b/mm/internal.h index 3bd08bafad04..9826f7dce607 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1504,6 +1504,12 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, /* Only track the nodes of mappings with shadow entries */ void workingset_update_node(struct xa_node *node); extern struct list_lru shadow_nodes; +#define mapping_set_update(xas, mapping) do { \ + if (!dax_mapping(mapping) && !shmem_mapping(mapping)) { \ + xas_set_update(xas, workingset_update_node); \ + xas_set_lru(xas, &shadow_nodes); \ + } \ +} while (0) /* mremap.c */ unsigned long move_page_tables(struct vm_area_struct *vma, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6f8d46d107b4..653dbb1ff05c 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -1837,6 +1838,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, if (result != SCAN_SUCCEED) goto out; + mapping_set_update(&xas, mapping); + __folio_set_locked(new_folio); if (is_shmem) __folio_set_swapbacked(new_folio); -- cgit v1.2.3 From f505e6c91e7a22d10316665a86d79f84d9f0ba76 Mon Sep 17 00:00:00 2001 From: Marco Nelissen Date: Thu, 2 Jan 2025 11:04:11 -0800 Subject: filemap: avoid truncating 64-bit offset to 32 bits On 32-bit kernels, folio_seek_hole_data() was inadvertently truncating a 64-bit value to 32 bits, leading to a possible infinite loop when writing to an xfs filesystem. Link: https://lkml.kernel.org/r/20250102190540.1356838-1-marco.nelissen@gmail.com Fixes: 54fa39ac2e00 ("iomap: use mapping_seek_hole_data") Signed-off-by: Marco Nelissen Cc: Matthew Wilcox (Oracle) Cc: Signed-off-by: Andrew Morton --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 33b60d448fca..118fa1e0bafe 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2996,7 +2996,7 @@ static inline loff_t folio_seek_hole_data(struct xa_state *xas, if (ops->is_partially_uptodate(folio, offset, bsz) == seek_data) break; - start = (start + bsz) & ~(bsz - 1); + start = (start + bsz) & ~((u64)bsz - 1); offset += bsz; } while (offset < folio_size(folio)); unlock: -- cgit v1.2.3 From 1c47c57818ad73d2d09ddbcb4839708aab5ff2e3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 10 Jan 2025 16:32:57 +0000 Subject: mm: fix assertion in folio_end_read() We only need to assert that the uptodate flag is clear if we're going to set it. This hasn't been a problem before now because we have only used folio_end_read() when completing with an error, but it's convenient to use it in squashfs if we discover the folio is already uptodate. Link: https://lkml.kernel.org/r/20250110163300.3346321-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Phillip Lougher Signed-off-by: Andrew Morton --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 118fa1e0bafe..4f476411a9a2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1523,7 +1523,7 @@ void folio_end_read(struct folio *folio, bool success) /* Must be in bottom byte for x86 to work */ BUILD_BUG_ON(PG_uptodate > 7); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio); + VM_BUG_ON_FOLIO(success && folio_test_uptodate(folio), folio); if (likely(success)) mask |= 1 << PG_uptodate; -- cgit v1.2.3 From 1168b2bec7660f5146de7b14c14b52417e900d18 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 16 Nov 2024 15:14:46 +0000 Subject: filemap: remove unused folio_add_wait_queue folio_add_wait_queue() has been unused since 2021's commit 850cba069c26 ("cachefiles: Delete the cachefiles driver pending rewrite") Remove it. Link: https://lkml.kernel.org/r/20241116151446.95555-1-linux@treblig.org Signed-off-by: Dr. David Alan Gilbert Reviewed-by: David Hildenbrand Reviewed-by: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 5 ----- mm/filemap.c | 19 ------------------- 2 files changed, 24 deletions(-) (limited to 'mm/filemap.c') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d796c8a33647..fc2e1319c7bb 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1280,11 +1280,6 @@ void folio_end_private_2(struct folio *folio); void folio_wait_private_2(struct folio *folio); int folio_wait_private_2_killable(struct folio *folio); -/* - * Add an arbitrary waiter to a page's wait queue - */ -void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter); - /* * Fault in userspace address range. */ diff --git a/mm/filemap.c b/mm/filemap.c index 4f476411a9a2..b6494d2d3bc2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1463,25 +1463,6 @@ static int folio_put_wait_locked(struct folio *folio, int state) return folio_wait_bit_common(folio, PG_locked, state, DROP); } -/** - * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue - * @folio: Folio defining the wait queue of interest - * @waiter: Waiter to add to the queue - * - * Add an arbitrary @waiter to the wait queue for the nominated @folio. - */ -void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter) -{ - wait_queue_head_t *q = folio_waitqueue(folio); - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_entry_tail(q, waiter); - folio_set_waiters(folio); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL_GPL(folio_add_wait_queue); - /** * folio_unlock - Unlock a locked folio. * @folio: The folio. -- cgit v1.2.3 From 5f537664e705b0bf8b7e329861f20128534f6a83 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 21 Jan 2025 09:27:22 -0800 Subject: cachestat: fix page cache statistics permission checking When the 'cachestat()' system call was added in commit cf264e1329fb ("cachestat: implement cachestat syscall"), it was meant to be a much more convenient (and performant) version of mincore() that didn't need mapping things into the user virtual address space in order to work. But it ended up missing the "check for writability or ownership" fix for mincore(), done in commit 134fca9063ad ("mm/mincore.c: make mincore() more conservative"). This just adds equivalent logic to 'cachestat()', modified for the file context (rather than vma). Reported-by: Sudheendra Raghav Neela Fixes: cf264e1329fb ("cachestat: implement cachestat syscall") Tested-by: Johannes Weiner Acked-by: Johannes Weiner Acked-by: Nhat Pham Signed-off-by: Linus Torvalds --- mm/filemap.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 4f476411a9a2..440922a7d8f1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -4375,6 +4375,20 @@ resched: rcu_read_unlock(); } +/* + * See mincore: reveal pagecache information only for files + * that the calling process has write access to, or could (if + * tried) open for writing. + */ +static inline bool can_do_cachestat(struct file *f) +{ + if (f->f_mode & FMODE_WRITE) + return true; + if (inode_owner_or_capable(file_mnt_idmap(f), file_inode(f))) + return true; + return file_permission(f, MAY_WRITE) == 0; +} + /* * The cachestat(2) system call. * @@ -4430,6 +4444,9 @@ SYSCALL_DEFINE4(cachestat, unsigned int, fd, if (is_file_hugepages(fd_file(f))) return -EOPNOTSUPP; + if (!can_do_cachestat(fd_file(f))) + return -EPERM; + if (flags != 0) return -EINVAL; -- cgit v1.2.3 From 9ad6344568cc31ede9741795b3e3c41c21e3156f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:39 -0700 Subject: mm/filemap: change filemap_create_folio() to take a struct kiocb Patch series "Uncached buffered IO", v8. 5 years ago I posted patches adding support for RWF_UNCACHED, as a way to do buffered IO that isn't page cache persistent. The approach back then was to have private pages for IO, and then get rid of them once IO was done. But that then runs into all the issues that O_DIRECT has, in terms of synchronizing with the page cache. So here's a new approach to the same concent, but using the page cache as synchronization. Due to excessive bike shedding on the naming, this is now named RWF_DONTCACHE, and is less special in that it's just page cache IO, except it prunes the ranges once IO is completed. Why do this, you may ask? The tldr is that device speeds are only getting faster, while reclaim is not. Doing normal buffered IO can be very unpredictable, and suck up a lot of resources on the reclaim side. This leads people to use O_DIRECT as a work-around, which has its own set of restrictions in terms of size, offset, and length of IO. It's also inherently synchronous, and now you need async IO as well. While the latter isn't necessarily a big problem as we have good options available there, it also should not be a requirement when all you want to do is read or write some data without caching. Even on desktop type systems, a normal NVMe device can fill the entire page cache in seconds. On the big system I used for testing, there's a lot more RAM, but also a lot more devices. As can be seen in some of the results in the following patches, you can still fill RAM in seconds even when there's 1TB of it. Hence this problem isn't solely a "big hyperscaler system" issue, it's common across the board. Common for both reads and writes with RWF_DONTCACHE is that they use the page cache for IO. Reads work just like a normal buffered read would, with the only exception being that the touched ranges will get pruned after data has been copied. For writes, the ranges will get writeback kicked off before the syscall returns, and then writeback completion will prune the range. Hence writes aren't synchronous, and it's easy to pipeline writes using RWF_DONTCACHE. Folios that aren't instantiated by RWF_DONTCACHE IO are left untouched. This means you that uncached IO will take advantage of the page cache for uptodate data, but not leave anything it instantiated/created in cache. File systems need to support this. This patchset adds support for the generic read path, which covers file systems like ext4. Patches exist to add support for iomap/XFS and btrfs as well, which sit on top of this series. If RWF_DONTCACHE IO is attempted on a file system that doesn't support it, -EOPNOTSUPP is returned. Hence the user can rely on it either working as designed, or flagging and error if that's not the case. The intent here is to give the application a sensible fallback path - eg, it may fall back to O_DIRECT if appropriate, or just live with the fact that uncached IO isn't available and do normal buffered IO. Adding "support" to other file systems should be trivial, most of the time just a one-liner adding FOP_DONTCACHE to the fop_flags in the file_operations struct, if the file system is using either iomap or the generic filemap helpers for reading and writing. Performance results are in patch 8 for reads, and you can find the write side results in the XFS patch adding support for DONTCACHE writes for XFS: https://git.kernel.dk/cgit/linux/commit/?h=buffered-uncached-fs.10&id=257e92de795fdff7d7e256501e024fac6da6a7f4 with the tldr being that I see about a 65% improvement in performance for both, with fully predictable IO times. CPU reduction is substantial as well, with no kswapd activity at all for reclaim when using uncached IO. Using it from applications is trivial - just set RWF_DONTCACHE for the read or write, using pwritev2(2) or preadv2(2). For io_uring, same thing, just set RWF_DONTCACHE in sqe->rw_flags for a buffered read/write operation. And that's it. Patches 1..7 are just prep patches, and should have no functional changes at all. Patch 8 adds support for the filemap path for RWF_DONTCACHE reads, and patches 9..12 are just prep patches for supporting the write side of uncached writes. In the below mentioned branch, there are then patches to adopt uncached reads and writes for xfs, btrfs, and ext4. The latter currently relies on bit of a hack for passing whether this is an uncached write or not through ->write_begin(), which can hopefully go away once ext4 adopts iomap for buffered writes. I say this is a hack as it's not the prettiest way to do it, however it is fully solid and will work just fine. Passes full xfstests and fsx overnight runs, no issues observed. That includes the vm running the testing also using RWF_DONTCACHE on the host. I'll post fsstress and fsx patches for RWF_DONTCACHE separately. As far as I'm concerned, no further work needs doing here. And git tree for the patches is here: https://git.kernel.dk/cgit/linux/log/?h=buffered-uncached.10 with the file system patches on top adding support for xfs/btrfs/ext4 here: https://git.kernel.dk/cgit/linux/log/?h=buffered-uncached-fs.10 This patch (of 12): Rather than pass in both the file and position directly from the kiocb, just take a struct kiocb instead. With the kiocb being passed in, skip passing in the address_space separately as well. While doing so, move the ki_flags checking into filemap_create_folio() as well. In preparation for actually needing the kiocb in the function. No functional changes in this patch. Link: https://lkml.kernel.org/r/20241220154831.1086649-1-axboe@kernel.dk Link: https://lkml.kernel.org/r/20241220154831.1086649-2-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Kirill A. Shutemov Reviewed-by: Matthew Wilcox (Oracle) Cc: Brian Foster Cc: Chris Mason Cc: Johannes Weiner Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/filemap.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index b6494d2d3bc2..904d8fa2bfc0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2431,15 +2431,17 @@ unlock_mapping: return error; } -static int filemap_create_folio(struct file *file, - struct address_space *mapping, loff_t pos, - struct folio_batch *fbatch) +static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch) { + struct address_space *mapping = iocb->ki_filp->f_mapping; struct folio *folio; int error; unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t index; + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) + return -EAGAIN; + folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order); if (!folio) return -ENOMEM; @@ -2458,7 +2460,7 @@ static int filemap_create_folio(struct file *file, * well to keep locking rules simple. */ filemap_invalidate_lock_shared(mapping); - index = (pos >> (PAGE_SHIFT + min_order)) << min_order; + index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order; error = filemap_add_folio(mapping, folio, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) @@ -2466,7 +2468,8 @@ static int filemap_create_folio(struct file *file, if (error) goto error; - error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); + error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio, + folio); if (error) goto error; @@ -2522,9 +2525,7 @@ retry: filemap_get_read_batch(mapping, index, last_index - 1, fbatch); } if (!folio_batch_count(fbatch)) { - if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) - return -EAGAIN; - err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch); + err = filemap_create_folio(iocb, fbatch); if (err == AOP_TRUNCATED_PAGE) goto retry; return err; -- cgit v1.2.3 From f598cdaafc370a797ae883d370a7c18c1ffc43ef Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:40 -0700 Subject: mm/filemap: use page_cache_sync_ra() to kick off read-ahead Rather than use the page_cache_sync_readahead() helper, define our own ractl and use page_cache_sync_ra() directly. In preparation for needing to modify ractl inside filemap_get_pages(). No functional changes in this patch. Link: https://lkml.kernel.org/r/20241220154831.1086649-3-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Kirill A. Shutemov Reviewed-by: Christoph Hellwig Reviewed-by: Matthew Wilcox (Oracle) Cc: Brian Foster Cc: Chris Mason Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/filemap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 904d8fa2bfc0..a1fda00aa6bc 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2499,7 +2499,6 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count, { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; - struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index; struct folio *folio; @@ -2514,12 +2513,13 @@ retry: filemap_get_read_batch(mapping, index, last_index - 1, fbatch); if (!folio_batch_count(fbatch)) { + DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index); + if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; if (iocb->ki_flags & IOCB_NOWAIT) flags = memalloc_noio_save(); - page_cache_sync_readahead(mapping, ra, filp, index, - last_index - index); + page_cache_sync_ra(&ractl, last_index - index); if (iocb->ki_flags & IOCB_NOWAIT) memalloc_noio_restore(flags); filemap_get_read_batch(mapping, index, last_index - 1, fbatch); -- cgit v1.2.3 From 8026e49bff9b151609da4cae20e9da7f1833dde6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:46 -0700 Subject: mm/filemap: add read support for RWF_DONTCACHE Add RWF_DONTCACHE as a read operation flag, which means that any data read wil be removed from the page cache upon completion. Uses the page cache to synchronize, and simply prunes folios that were instantiated when the operation completes. While it would be possible to use private pages for this, using the page cache as synchronization is handy for a variety of reasons: 1) No special truncate magic is needed 2) Async buffered reads need some place to serialize, using the page cache is a lot easier than writing extra code for this 3) The pruning cost is pretty reasonable and the code to support this is much simpler as a result. You can think of uncached buffered IO as being the much more attractive cousin of O_DIRECT - it has none of the restrictions of O_DIRECT. Yes, it will copy the data, but unlike regular buffered IO, it doesn't run into the unpredictability of the page cache in terms of reclaim. As an example, on a test box with 32 drives, reading them with buffered IO looks as follows: Reading bs 65536, uncached 0 1s: 145945MB/sec 2s: 158067MB/sec 3s: 157007MB/sec 4s: 148622MB/sec 5s: 118824MB/sec 6s: 70494MB/sec 7s: 41754MB/sec 8s: 90811MB/sec 9s: 92204MB/sec 10s: 95178MB/sec 11s: 95488MB/sec 12s: 95552MB/sec 13s: 96275MB/sec where it's quite easy to see where the page cache filled up, and performance went from good to erratic, and finally settles at a much lower rate. Looking at top while this is ongoing, we see: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 7535 root 20 0 267004 0 0 S 3199 0.0 8:40.65 uncached 3326 root 20 0 0 0 0 R 100.0 0.0 0:16.40 kswapd4 3327 root 20 0 0 0 0 R 100.0 0.0 0:17.22 kswapd5 3328 root 20 0 0 0 0 R 100.0 0.0 0:13.29 kswapd6 3332 root 20 0 0 0 0 R 100.0 0.0 0:11.11 kswapd10 3339 root 20 0 0 0 0 R 100.0 0.0 0:16.25 kswapd17 3348 root 20 0 0 0 0 R 100.0 0.0 0:16.40 kswapd26 3343 root 20 0 0 0 0 R 100.0 0.0 0:16.30 kswapd21 3344 root 20 0 0 0 0 R 100.0 0.0 0:11.92 kswapd22 3349 root 20 0 0 0 0 R 100.0 0.0 0:16.28 kswapd27 3352 root 20 0 0 0 0 R 99.7 0.0 0:11.89 kswapd30 3353 root 20 0 0 0 0 R 96.7 0.0 0:16.04 kswapd31 3329 root 20 0 0 0 0 R 96.4 0.0 0:11.41 kswapd7 3345 root 20 0 0 0 0 R 96.4 0.0 0:13.40 kswapd23 3330 root 20 0 0 0 0 S 91.1 0.0 0:08.28 kswapd8 3350 root 20 0 0 0 0 S 86.8 0.0 0:11.13 kswapd28 3325 root 20 0 0 0 0 S 76.3 0.0 0:07.43 kswapd3 3341 root 20 0 0 0 0 S 74.7 0.0 0:08.85 kswapd19 3334 root 20 0 0 0 0 S 71.7 0.0 0:10.04 kswapd12 3351 root 20 0 0 0 0 R 60.5 0.0 0:09.59 kswapd29 3323 root 20 0 0 0 0 R 57.6 0.0 0:11.50 kswapd1 [...] which is just showing a partial list of the 32 kswapd threads that are running mostly full tilt, burning ~28 full CPU cores. If the same test case is run with RWF_DONTCACHE set for the buffered read, the output looks as follows: Reading bs 65536, uncached 0 1s: 153144MB/sec 2s: 156760MB/sec 3s: 158110MB/sec 4s: 158009MB/sec 5s: 158043MB/sec 6s: 157638MB/sec 7s: 157999MB/sec 8s: 158024MB/sec 9s: 157764MB/sec 10s: 157477MB/sec 11s: 157417MB/sec 12s: 157455MB/sec 13s: 157233MB/sec 14s: 156692MB/sec which is just chugging along at ~155GB/sec of read performance. Looking at top, we see: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 7961 root 20 0 267004 0 0 S 3180 0.0 5:37.95 uncached 8024 axboe 20 0 14292 4096 0 R 1.0 0.0 0:00.13 top where just the test app is using CPU, no reclaim is taking place outside of the main thread. Not only is performance 65% better, it's also using half the CPU to do it. Link: https://lkml.kernel.org/r/20241220154831.1086649-9-axboe@kernel.dk Signed-off-by: Jens Axboe Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 28 ++++++++++++++++++++++++++-- mm/swap.c | 2 ++ 2 files changed, 28 insertions(+), 2 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index a1fda00aa6bc..9eade935a48c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2445,6 +2445,8 @@ static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch) folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order); if (!folio) return -ENOMEM; + if (iocb->ki_flags & IOCB_DONTCACHE) + __folio_set_dropbehind(folio); /* * Protect against truncate / hole punch. Grabbing invalidate_lock @@ -2490,6 +2492,8 @@ static int filemap_readahead(struct kiocb *iocb, struct file *file, if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; + if (iocb->ki_flags & IOCB_DONTCACHE) + ractl.dropbehind = 1; page_cache_async_ra(&ractl, folio, last_index - folio->index); return 0; } @@ -2519,6 +2523,8 @@ retry: return -EAGAIN; if (iocb->ki_flags & IOCB_NOWAIT) flags = memalloc_noio_save(); + if (iocb->ki_flags & IOCB_DONTCACHE) + ractl.dropbehind = 1; page_cache_sync_ra(&ractl, last_index - index); if (iocb->ki_flags & IOCB_NOWAIT) memalloc_noio_restore(flags); @@ -2566,6 +2572,20 @@ static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio) return (pos1 >> shift == pos2 >> shift); } +static void filemap_end_dropbehind_read(struct address_space *mapping, + struct folio *folio) +{ + if (!folio_test_dropbehind(folio)) + return; + if (folio_test_writeback(folio) || folio_test_dirty(folio)) + return; + if (folio_trylock(folio)) { + if (folio_test_clear_dropbehind(folio)) + folio_unmap_invalidate(mapping, folio, 0); + folio_unlock(folio); + } +} + /** * filemap_read - Read data from the page cache. * @iocb: The iocb to read. @@ -2679,8 +2699,12 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, } } put_folios: - for (i = 0; i < folio_batch_count(&fbatch); i++) - folio_put(fbatch.folios[i]); + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + + filemap_end_dropbehind_read(mapping, folio); + folio_put(folio); + } folio_batch_init(&fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); diff --git a/mm/swap.c b/mm/swap.c index 746a5ceba42c..fc8281ef4241 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -448,6 +448,8 @@ static bool lru_gen_clear_refs(struct folio *folio) */ void folio_mark_accessed(struct folio *folio) { + if (folio_test_dropbehind(folio)) + return; if (lru_gen_enabled()) { lru_gen_inc_refs(folio); return; -- cgit v1.2.3 From fb7d3bc4149395c1ae99029c852eab6c28fc3c88 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:47 -0700 Subject: mm/filemap: drop streaming/uncached pages when writeback completes If the folio is marked as streaming, drop pages when writeback completes. Intended to be used with RWF_DONTCACHE, to avoid needing sync writes for uncached IO. Link: https://lkml.kernel.org/r/20241220154831.1086649-10-axboe@kernel.dk Signed-off-by: Jens Axboe Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 9eade935a48c..fb17b573ae51 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1571,6 +1571,27 @@ int folio_wait_private_2_killable(struct folio *folio) } EXPORT_SYMBOL(folio_wait_private_2_killable); +/* + * If folio was marked as dropbehind, then pages should be dropped when writeback + * completes. Do that now. If we fail, it's likely because of a big folio - + * just reset dropbehind for that case and latter completions should invalidate. + */ +static void folio_end_dropbehind_write(struct folio *folio) +{ + /* + * Hitting !in_task() should not happen off RWF_DONTCACHE writeback, + * but can happen if normal writeback just happens to find dirty folios + * that were created as part of uncached writeback, and that writeback + * would otherwise not need non-IRQ handling. Just skip the + * invalidation in that case. + */ + if (in_task() && folio_trylock(folio)) { + if (folio->mapping) + folio_unmap_invalidate(folio->mapping, folio, 0); + folio_unlock(folio); + } +} + /** * folio_end_writeback - End writeback against a folio. * @folio: The folio. @@ -1581,6 +1602,8 @@ EXPORT_SYMBOL(folio_wait_private_2_killable); */ void folio_end_writeback(struct folio *folio) { + bool folio_dropbehind = false; + VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); /* @@ -1602,9 +1625,14 @@ void folio_end_writeback(struct folio *folio) * reused before the folio_wake_bit(). */ folio_get(folio); + if (!folio_test_dirty(folio)) + folio_dropbehind = folio_test_clear_dropbehind(folio); if (__folio_end_writeback(folio)) folio_wake_bit(folio, PG_writeback); acct_reclaim_writeback(folio); + + if (folio_dropbehind) + folio_end_dropbehind_write(folio); folio_put(folio); } EXPORT_SYMBOL(folio_end_writeback); -- cgit v1.2.3 From dddc559f2e7cff9c6525150cd29ef3a4f6692b26 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:48 -0700 Subject: mm/filemap: add filemap_fdatawrite_range_kick() helper Works like filemap_fdatawrite_range(), except it's a non-integrity data writeback and hence only starts writeback on the specified range. Will help facilitate generically starting uncached writeback from generic_write_sync(), as header dependencies preclude doing this inline from fs.h. Link: https://lkml.kernel.org/r/20241220154831.1086649-11-axboe@kernel.dk Signed-off-by: Jens Axboe Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/fs.h | 2 ++ mm/filemap.c | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'mm/filemap.c') diff --git a/include/linux/fs.h b/include/linux/fs.h index 6a838b5479a6..653b5efa3d3f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2878,6 +2878,8 @@ extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, extern int __must_check file_check_and_advance_wb_err(struct file *file); extern int __must_check file_write_and_wait_range(struct file *file, loff_t start, loff_t end); +int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, + loff_t end); static inline int file_write_and_wait(struct file *file) { diff --git a/mm/filemap.c b/mm/filemap.c index fb17b573ae51..0aa3861aed45 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -440,6 +440,24 @@ int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, } EXPORT_SYMBOL(filemap_fdatawrite_range); +/** + * filemap_fdatawrite_range_kick - start writeback on a range + * @mapping: target address_space + * @start: index to start writeback on + * @end: last (non-inclusive) index for writeback + * + * This is a non-integrity writeback helper, to start writing back folios + * for the indicated range. + * + * Return: %0 on success, negative error code otherwise. + */ +int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, + loff_t end) +{ + return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE); +} +EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick); + /** * filemap_flush - mostly a non-blocking flush * @mapping: target address_space -- cgit v1.2.3 From d94d23fdd7529f1f3218235d1e0a69e9856907b7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:50 -0700 Subject: mm: add FGP_DONTCACHE folio creation flag Callers can pass this in for uncached folio creation, in which case if a folio is newly created it gets marked as uncached. If a folio exists for this index and lookup succeeds, then it will not get marked as uncached. If an !uncached lookup finds a cached folio, clear the flag. For that case, there are competeting uncached and cached users of the folio, and it should not get pruned. Link: https://lkml.kernel.org/r/20241220154831.1086649-13-axboe@kernel.dk Signed-off-by: Jens Axboe Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 2 ++ mm/filemap.c | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'mm/filemap.c') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d53c49abead6..47bfc6b1b632 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -710,6 +710,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, * * %FGP_NOFS - __GFP_FS will get cleared in gfp. * * %FGP_NOWAIT - Don't block on the folio lock. * * %FGP_STABLE - Wait for the folio to be stable (finished writeback) + * * %FGP_DONTCACHE - Uncached buffered IO * * %FGP_WRITEBEGIN - The flags to use in a filesystem write_begin() * implementation. */ @@ -723,6 +724,7 @@ typedef unsigned int __bitwise fgf_t; #define FGP_NOWAIT ((__force fgf_t)0x00000020) #define FGP_FOR_MMAP ((__force fgf_t)0x00000040) #define FGP_STABLE ((__force fgf_t)0x00000080) +#define FGP_DONTCACHE ((__force fgf_t)0x00000100) #define FGF_GET_ORDER(fgf) (((__force unsigned)fgf) >> 26) /* top 6 bits */ #define FGP_WRITEBEGIN (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE) diff --git a/mm/filemap.c b/mm/filemap.c index 0aa3861aed45..279959cf9300 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1973,6 +1973,8 @@ no_page: /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) __folio_set_referenced(folio); + if (fgp_flags & FGP_DONTCACHE) + __folio_set_dropbehind(folio); err = filemap_add_folio(mapping, folio, index, gfp); if (!err) @@ -1995,6 +1997,9 @@ no_page: if (!folio) return ERR_PTR(-ENOENT); + /* not an uncached lookup, clear uncached if set */ + if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE)) + folio_clear_dropbehind(folio); return folio; } EXPORT_SYMBOL(__filemap_get_folio); -- cgit v1.2.3