From 80c1fe902691d3ef4786f9e62e47a0aa0deb8b54 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sat, 30 Nov 2019 17:49:41 -0800 Subject: mm/filemap.c: remove redundant cache invalidation after async direct-io write generic_file_direct_write() invalidates cache at entry. Second time this should be done when request completes. But this function calls second invalidation at exit unconditionally even for async requests. This patch skips second invalidation for async requests (-EIOCBQUEUED). Link: http://lkml.kernel.org/r/157270037850.4812.15036239021726025572.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Andrew Morton Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 85b7d087eb45..288e38199068 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3218,9 +3218,11 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) * Most of the time we do not need this since dio_complete() will do * the invalidation for us. However there are some file systems that * do not end up with dio_complete() being called, so let's not break - * them by removing it completely + * them by removing it completely. + * + * Skip invalidation for async writes or if mapping has no pages. */ - if (mapping->nrpages) + if (written > 0 && mapping->nrpages) invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end); -- cgit v1.2.3 From a92853b6746fe5ffef20a7c30addf6320561e669 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sat, 30 Nov 2019 17:49:44 -0800 Subject: fs/direct-io.c: keep dio_warn_stale_pagecache() when CONFIG_BLOCK=n This helper prints warning if direct I/O write failed to invalidate cache, and set EIO at inode to warn usersapce about possible data corruption. See also commit 5a9d929d6e13 ("iomap: report collisions between directio and buffered writes to userspace"). Direct I/O is supported by non-disk filesystems, for example NFS. Thus generic code needs this even in kernel without CONFIG_BLOCK. Link: http://lkml.kernel.org/r/157270038074.4812.7980855544557488880.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Andrew Morton Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/direct-io.c | 21 --------------------- include/linux/fs.h | 6 +++++- mm/filemap.c | 21 +++++++++++++++++++++ 3 files changed, 26 insertions(+), 22 deletions(-) (limited to 'mm/filemap.c') diff --git a/fs/direct-io.c b/fs/direct-io.c index 9329ced91f1d..0ec4f270139f 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -220,27 +220,6 @@ static inline struct page *dio_get_page(struct dio *dio, return dio->pages[sdio->head]; } -/* - * Warn about a page cache invalidation failure during a direct io write. - */ -void dio_warn_stale_pagecache(struct file *filp) -{ - static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); - char pathname[128]; - struct inode *inode = file_inode(filp); - char *path; - - errseq_set(&inode->i_mapping->wb_err, -EIO); - if (__ratelimit(&_rs)) { - path = file_path(filp, pathname, sizeof(pathname)); - if (IS_ERR(path)) - path = "(unknown)"; - pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n"); - pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, - current->comm); - } -} - /* * dio_complete() - called when all DIO BIO I/O has been completed * diff --git a/include/linux/fs.h b/include/linux/fs.h index ae6c5c37f3ae..eeed80fab36a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3149,7 +3149,6 @@ enum { }; void dio_end_io(struct bio *bio); -void dio_warn_stale_pagecache(struct file *filp); ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, @@ -3194,6 +3193,11 @@ static inline void inode_dio_end(struct inode *inode) wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); } +/* + * Warn about a page cache invalidation failure diring a direct I/O write. + */ +void dio_warn_stale_pagecache(struct file *filp); + extern void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask); diff --git a/mm/filemap.c b/mm/filemap.c index 288e38199068..189b8f318da2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3161,6 +3161,27 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, } EXPORT_SYMBOL(pagecache_write_end); +/* + * Warn about a page cache invalidation failure during a direct I/O write. + */ +void dio_warn_stale_pagecache(struct file *filp) +{ + static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); + char pathname[128]; + struct inode *inode = file_inode(filp); + char *path; + + errseq_set(&inode->i_mapping->wb_err, -EIO); + if (__ratelimit(&_rs)) { + path = file_path(filp, pathname, sizeof(pathname)); + if (IS_ERR(path)) + path = "(unknown)"; + pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n"); + pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, + current->comm); + } +} + ssize_t generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) { -- cgit v1.2.3 From 9266a14033a81b3096feccd10542c20b3f47fe8e Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sat, 30 Nov 2019 17:49:47 -0800 Subject: mm/filemap.c: warn if stale pagecache is left after direct write generic_file_direct_write() tries to invalidate pagecache after O_DIRECT write. Unlike to similar code in dio_complete() this silently ignores error returned from invalidate_inode_pages2_range(). According to comment this code here because not all filesystems call dio_complete() to do proper invalidation after O_DIRECT write. Noticeable example is a blkdev_direct_IO(). This patch calls dio_warn_stale_pagecache() if invalidation fails. Link: http://lkml.kernel.org/r/157270038294.4812.2238891109785106069.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Andrew Morton Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 189b8f318da2..dc3b78db079b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3241,11 +3241,13 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) * do not end up with dio_complete() being called, so let's not break * them by removing it completely. * + * Noticeable example is a blkdev_direct_IO(). + * * Skip invalidation for async writes or if mapping has no pages. */ - if (written > 0 && mapping->nrpages) - invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); + if (written > 0 && mapping->nrpages && + invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end)) + dio_warn_stale_pagecache(file); if (written > 0) { pos += written; -- cgit v1.2.3 From 89b15332af7c0312a41e50846819ca6613b58b4c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:50:22 -0800 Subject: mm: drop mmap_sem before calling balance_dirty_pages() in write fault One of our services is observing hanging ps/top/etc under heavy write IO, and the task states show this is an mmap_sem priority inversion: A write fault is holding the mmap_sem in read-mode and waiting for (heavily cgroup-limited) IO in balance_dirty_pages(): balance_dirty_pages+0x724/0x905 balance_dirty_pages_ratelimited+0x254/0x390 fault_dirty_shared_page.isra.96+0x4a/0x90 do_wp_page+0x33e/0x400 __handle_mm_fault+0x6f0/0xfa0 handle_mm_fault+0xe4/0x200 __do_page_fault+0x22b/0x4a0 page_fault+0x45/0x50 Somebody tries to change the address space, contending for the mmap_sem in write-mode: call_rwsem_down_write_failed_killable+0x13/0x20 do_mprotect_pkey+0xa8/0x330 SyS_mprotect+0xf/0x20 do_syscall_64+0x5b/0x100 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 The waiting writer locks out all subsequent readers to avoid lock starvation, and several threads can be seen hanging like this: call_rwsem_down_read_failed+0x14/0x30 proc_pid_cmdline_read+0xa0/0x480 __vfs_read+0x23/0x140 vfs_read+0x87/0x130 SyS_read+0x42/0x90 do_syscall_64+0x5b/0x100 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 To fix this, do what we do for cache read faults already: drop the mmap_sem before calling into anything IO bound, in this case the balance_dirty_pages() function, and return VM_FAULT_RETRY. Link: http://lkml.kernel.org/r/20190924194238.GA29030@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Kirill A. Shutemov Cc: Josef Bacik Cc: Hillf Danton Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 21 --------------------- mm/internal.h | 21 +++++++++++++++++++++ mm/memory.c | 38 +++++++++++++++++++++++++++----------- 3 files changed, 48 insertions(+), 32 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index dc3b78db079b..bf6aa30be58d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2329,27 +2329,6 @@ EXPORT_SYMBOL(generic_file_read_iter); #ifdef CONFIG_MMU #define MMAP_LOTSAMISS (100) -static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, - struct file *fpin) -{ - int flags = vmf->flags; - - if (fpin) - return fpin; - - /* - * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or - * anything, so we only pin the file and drop the mmap_sem if only - * FAULT_FLAG_ALLOW_RETRY is set. - */ - if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) == - FAULT_FLAG_ALLOW_RETRY) { - fpin = get_file(vmf->vma->vm_file); - up_read(&vmf->vma->vm_mm->mmap_sem); - } - return fpin; -} - /* * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem * @vmf - the vm_fault for this fault. diff --git a/mm/internal.h b/mm/internal.h index 0d5f720c75ab..7dd7fbb577a9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -362,6 +362,27 @@ vma_address(struct page *page, struct vm_area_struct *vma) return max(start, vma->vm_start); } +static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, + struct file *fpin) +{ + int flags = vmf->flags; + + if (fpin) + return fpin; + + /* + * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or + * anything, so we only pin the file and drop the mmap_sem if only + * FAULT_FLAG_ALLOW_RETRY is set. + */ + if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) == + FAULT_FLAG_ALLOW_RETRY) { + fpin = get_file(vmf->vma->vm_file); + up_read(&vmf->vma->vm_mm->mmap_sem); + } + return fpin; +} + #else /* !CONFIG_MMU */ static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } diff --git a/mm/memory.c b/mm/memory.c index b6a5d6a08438..9ea917e28ef4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2289,10 +2289,11 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf) * * The function expects the page to be locked and unlocks it. */ -static void fault_dirty_shared_page(struct vm_area_struct *vma, - struct page *page) +static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct address_space *mapping; + struct page *page = vmf->page; bool dirtied; bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; @@ -2307,16 +2308,30 @@ static void fault_dirty_shared_page(struct vm_area_struct *vma, mapping = page_rmapping(page); unlock_page(page); + if (!page_mkwrite) + file_update_time(vma->vm_file); + + /* + * Throttle page dirtying rate down to writeback speed. + * + * mapping may be NULL here because some device drivers do not + * set page.mapping but still dirty their pages + * + * Drop the mmap_sem before waiting on IO, if we can. The file + * is pinning the mapping, as per above. + */ if ((dirtied || page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping - * but still dirty their pages - */ + struct file *fpin; + + fpin = maybe_unlock_mmap_for_io(vmf, NULL); balance_dirty_pages_ratelimited(mapping); + if (fpin) { + fput(fpin); + return VM_FAULT_RETRY; + } } - if (!page_mkwrite) - file_update_time(vma->vm_file); + return 0; } /* @@ -2571,6 +2586,7 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; + vm_fault_t ret = VM_FAULT_WRITE; get_page(vmf->page); @@ -2594,10 +2610,10 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf) wp_page_reuse(vmf); lock_page(vmf->page); } - fault_dirty_shared_page(vma, vmf->page); + ret |= fault_dirty_shared_page(vmf); put_page(vmf->page); - return VM_FAULT_WRITE; + return ret; } /* @@ -3641,7 +3657,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) return ret; } - fault_dirty_shared_page(vma, vmf->page); + ret |= fault_dirty_shared_page(vmf); return ret; } -- cgit v1.2.3