From c37222082f23c456664d1c3182a714670ab8f9a4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 May 2023 14:49:48 +0100 Subject: splice: Fix filemap_splice_read() to use the correct inode Fix filemap_splice_read() to use file->f_mapping->host, not file->f_inode, as the source of the file size because in the case of a block device, file->f_inode points to the block-special file (which is typically 0 length) and not the backing store. Fixes: 07073eb01c5f ("splice: Add a func to do a splice from a buffered file without ITER_PIPE") Signed-off-by: David Howells Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner cc: Steve French cc: Jens Axboe cc: Al Viro cc: David Hildenbrand cc: John Hubbard cc: linux-mm@kvack.org cc: linux-block@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20230522135018.2742245-2-dhowells@redhat.com Signed-off-by: Jens Axboe --- mm/filemap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index b4c9bd368b7e..a2006936a6ae 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2900,7 +2900,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, do { cond_resched(); - if (*ppos >= i_size_read(file_inode(in))) + if (*ppos >= i_size_read(in->f_mapping->host)) break; iocb.ki_pos = *ppos; @@ -2916,7 +2916,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). */ - isize = i_size_read(file_inode(in)); + isize = i_size_read(in->f_mapping->host); if (unlikely(*ppos >= isize)) break; end_offset = min_t(loff_t, isize, *ppos + len); -- cgit v1.2.3 From 83aeff881e53fab5b46e4ceb5fb47fa0a22acba9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 May 2023 14:49:49 +0100 Subject: splice: Make filemap_splice_read() check s_maxbytes Make filemap_splice_read() check s_maxbytes analogously to filemap_read(). Signed-off-by: David Howells Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner cc: Steve French cc: Jens Axboe cc: Al Viro cc: David Hildenbrand cc: John Hubbard cc: linux-mm@kvack.org cc: linux-block@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20230522135018.2742245-3-dhowells@redhat.com Signed-off-by: Jens Axboe --- mm/filemap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index a2006936a6ae..0fcb0b80c2e2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2887,6 +2887,9 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, bool writably_mapped; int i, error = 0; + if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes)) + return 0; + init_sync_kiocb(&iocb, in); iocb.ki_pos = *ppos; -- cgit v1.2.3 From 3fc40265ae2b48a7475c41c5c0b256374c419f4b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 May 2023 14:50:17 +0100 Subject: iov_iter: Kill ITER_PIPE The ITER_PIPE-type iterator was only used by generic_file_splice_read() and that has been replaced and removed. This leaves ITER_PIPE unused - so remove it too. Signed-off-by: David Howells Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner cc: Jens Axboe cc: Al Viro cc: David Hildenbrand cc: John Hubbard cc: linux-mm@kvack.org cc: linux-block@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20230522135018.2742245-31-dhowells@redhat.com Signed-off-by: Jens Axboe --- include/linux/uio.h | 14 -- lib/iov_iter.c | 431 +--------------------------------------------------- mm/filemap.c | 3 +- 3 files changed, 4 insertions(+), 444 deletions(-) (limited to 'mm/filemap.c') diff --git a/include/linux/uio.h b/include/linux/uio.h index 044c1d8c230c..60c342bb7ab8 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -11,7 +11,6 @@ #include struct page; -struct pipe_inode_info; typedef unsigned int __bitwise iov_iter_extraction_t; @@ -25,7 +24,6 @@ enum iter_type { ITER_IOVEC, ITER_KVEC, ITER_BVEC, - ITER_PIPE, ITER_XARRAY, ITER_DISCARD, ITER_UBUF, @@ -74,7 +72,6 @@ struct iov_iter { const struct kvec *kvec; const struct bio_vec *bvec; struct xarray *xarray; - struct pipe_inode_info *pipe; void __user *ubuf; }; size_t count; @@ -82,10 +79,6 @@ struct iov_iter { }; union { unsigned long nr_segs; - struct { - unsigned int head; - unsigned int start_head; - }; loff_t xarray_start; }; }; @@ -133,11 +126,6 @@ static inline bool iov_iter_is_bvec(const struct iov_iter *i) return iov_iter_type(i) == ITER_BVEC; } -static inline bool iov_iter_is_pipe(const struct iov_iter *i) -{ - return iov_iter_type(i) == ITER_PIPE; -} - static inline bool iov_iter_is_discard(const struct iov_iter *i) { return iov_iter_type(i) == ITER_DISCARD; @@ -286,8 +274,6 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec unsigned long nr_segs, size_t count); void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count); -void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode_info *pipe, - size_t count); void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 960223ed9199..f18138e0292a 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -14,8 +14,6 @@ #include #include -#define PIPE_PARANOIA /* for now */ - /* covers ubuf and kbuf alike */ #define iterate_buf(i, n, base, len, off, __p, STEP) { \ size_t __maybe_unused off = 0; \ @@ -198,150 +196,6 @@ static int copyin(void *to, const void __user *from, size_t n) return res; } -#ifdef PIPE_PARANOIA -static bool sanity(const struct iov_iter *i) -{ - struct pipe_inode_info *pipe = i->pipe; - unsigned int p_head = pipe->head; - unsigned int p_tail = pipe->tail; - unsigned int p_occupancy = pipe_occupancy(p_head, p_tail); - unsigned int i_head = i->head; - unsigned int idx; - - if (i->last_offset) { - struct pipe_buffer *p; - if (unlikely(p_occupancy == 0)) - goto Bad; // pipe must be non-empty - if (unlikely(i_head != p_head - 1)) - goto Bad; // must be at the last buffer... - - p = pipe_buf(pipe, i_head); - if (unlikely(p->offset + p->len != abs(i->last_offset))) - goto Bad; // ... at the end of segment - } else { - if (i_head != p_head) - goto Bad; // must be right after the last buffer - } - return true; -Bad: - printk(KERN_ERR "idx = %d, offset = %d\n", i_head, i->last_offset); - printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n", - p_head, p_tail, pipe->ring_size); - for (idx = 0; idx < pipe->ring_size; idx++) - printk(KERN_ERR "[%p %p %d %d]\n", - pipe->bufs[idx].ops, - pipe->bufs[idx].page, - pipe->bufs[idx].offset, - pipe->bufs[idx].len); - WARN_ON(1); - return false; -} -#else -#define sanity(i) true -#endif - -static struct page *push_anon(struct pipe_inode_info *pipe, unsigned size) -{ - struct page *page = alloc_page(GFP_USER); - if (page) { - struct pipe_buffer *buf = pipe_buf(pipe, pipe->head++); - *buf = (struct pipe_buffer) { - .ops = &default_pipe_buf_ops, - .page = page, - .offset = 0, - .len = size - }; - } - return page; -} - -static void push_page(struct pipe_inode_info *pipe, struct page *page, - unsigned int offset, unsigned int size) -{ - struct pipe_buffer *buf = pipe_buf(pipe, pipe->head++); - *buf = (struct pipe_buffer) { - .ops = &page_cache_pipe_buf_ops, - .page = page, - .offset = offset, - .len = size - }; - get_page(page); -} - -static inline int last_offset(const struct pipe_buffer *buf) -{ - if (buf->ops == &default_pipe_buf_ops) - return buf->len; // buf->offset is 0 for those - else - return -(buf->offset + buf->len); -} - -static struct page *append_pipe(struct iov_iter *i, size_t size, - unsigned int *off) -{ - struct pipe_inode_info *pipe = i->pipe; - int offset = i->last_offset; - struct pipe_buffer *buf; - struct page *page; - - if (offset > 0 && offset < PAGE_SIZE) { - // some space in the last buffer; add to it - buf = pipe_buf(pipe, pipe->head - 1); - size = min_t(size_t, size, PAGE_SIZE - offset); - buf->len += size; - i->last_offset += size; - i->count -= size; - *off = offset; - return buf->page; - } - // OK, we need a new buffer - *off = 0; - size = min_t(size_t, size, PAGE_SIZE); - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) - return NULL; - page = push_anon(pipe, size); - if (!page) - return NULL; - i->head = pipe->head - 1; - i->last_offset = size; - i->count -= size; - return page; -} - -static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) -{ - struct pipe_inode_info *pipe = i->pipe; - unsigned int head = pipe->head; - - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - if (!sanity(i)) - return 0; - - if (offset && i->last_offset == -offset) { // could we merge it? - struct pipe_buffer *buf = pipe_buf(pipe, head - 1); - if (buf->page == page) { - buf->len += bytes; - i->last_offset -= bytes; - i->count -= bytes; - return bytes; - } - } - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) - return 0; - - push_page(pipe, page, offset, bytes); - i->last_offset = -(offset + bytes); - i->head = head; - i->count -= bytes; - return bytes; -} - /* * fault_in_iov_iter_readable - fault in iov iterator for reading * @i: iterator @@ -446,46 +300,6 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction, } EXPORT_SYMBOL(iov_iter_init); -// returns the offset in partial buffer (if any) -static inline unsigned int pipe_npages(const struct iov_iter *i, int *npages) -{ - struct pipe_inode_info *pipe = i->pipe; - int used = pipe->head - pipe->tail; - int off = i->last_offset; - - *npages = max((int)pipe->max_usage - used, 0); - - if (off > 0 && off < PAGE_SIZE) { // anon and not full - (*npages)++; - return off; - } - return 0; -} - -static size_t copy_pipe_to_iter(const void *addr, size_t bytes, - struct iov_iter *i) -{ - unsigned int off, chunk; - - if (unlikely(bytes > i->count)) - bytes = i->count; - if (unlikely(!bytes)) - return 0; - - if (!sanity(i)) - return 0; - - for (size_t n = bytes; n; n -= chunk) { - struct page *page = append_pipe(i, n, &off); - chunk = min_t(size_t, n, PAGE_SIZE - off); - if (!page) - return bytes - n; - memcpy_to_page(page, off, addr, chunk); - addr += chunk; - } - return bytes; -} - static __wsum csum_and_memcpy(void *to, const void *from, size_t len, __wsum sum, size_t off) { @@ -493,44 +307,10 @@ static __wsum csum_and_memcpy(void *to, const void *from, size_t len, return csum_block_add(sum, next, off); } -static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes, - struct iov_iter *i, __wsum *sump) -{ - __wsum sum = *sump; - size_t off = 0; - unsigned int chunk, r; - - if (unlikely(bytes > i->count)) - bytes = i->count; - if (unlikely(!bytes)) - return 0; - - if (!sanity(i)) - return 0; - - while (bytes) { - struct page *page = append_pipe(i, bytes, &r); - char *p; - - if (!page) - break; - chunk = min_t(size_t, bytes, PAGE_SIZE - r); - p = kmap_local_page(page); - sum = csum_and_memcpy(p + r, addr + off, chunk, sum, off); - kunmap_local(p); - off += chunk; - bytes -= chunk; - } - *sump = sum; - return off; -} - size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; - if (unlikely(iov_iter_is_pipe(i))) - return copy_pipe_to_iter(addr, bytes, i); if (user_backed_iter(i)) might_fault(); iterate_and_advance(i, bytes, base, len, off, @@ -552,42 +332,6 @@ static int copyout_mc(void __user *to, const void *from, size_t n) return n; } -static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes, - struct iov_iter *i) -{ - size_t xfer = 0; - unsigned int off, chunk; - - if (unlikely(bytes > i->count)) - bytes = i->count; - if (unlikely(!bytes)) - return 0; - - if (!sanity(i)) - return 0; - - while (bytes) { - struct page *page = append_pipe(i, bytes, &off); - unsigned long rem; - char *p; - - if (!page) - break; - chunk = min_t(size_t, bytes, PAGE_SIZE - off); - p = kmap_local_page(page); - rem = copy_mc_to_kernel(p + off, addr + xfer, chunk); - chunk -= rem; - kunmap_local(p); - xfer += chunk; - bytes -= chunk; - if (rem) { - iov_iter_revert(i, rem); - break; - } - } - return xfer; -} - /** * _copy_mc_to_iter - copy to iter with source memory error exception handling * @addr: source kernel address @@ -607,9 +351,8 @@ static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes, * alignment and poison alignment assumptions to avoid re-triggering * hardware exceptions. * - * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies. - * Compare to copy_to_iter() where only ITER_IOVEC attempts might return - * a short copy. + * * ITER_KVEC and ITER_BVEC can return short copies. Compare to + * copy_to_iter() where only ITER_IOVEC attempts might return a short copy. * * Return: number of bytes copied (may be %0) */ @@ -617,8 +360,6 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; - if (unlikely(iov_iter_is_pipe(i))) - return copy_mc_pipe_to_iter(addr, bytes, i); if (user_backed_iter(i)) might_fault(); __iterate_and_advance(i, bytes, base, len, off, @@ -732,8 +473,6 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, return 0; if (WARN_ON_ONCE(i->data_source)) return 0; - if (unlikely(iov_iter_is_pipe(i))) - return copy_page_to_iter_pipe(page, offset, bytes, i); page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { @@ -764,8 +503,6 @@ size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t byte return 0; if (WARN_ON_ONCE(i->data_source)) return 0; - if (unlikely(iov_iter_is_pipe(i))) - return copy_page_to_iter_pipe(page, offset, bytes, i); page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { @@ -818,36 +555,8 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, } EXPORT_SYMBOL(copy_page_from_iter); -static size_t pipe_zero(size_t bytes, struct iov_iter *i) -{ - unsigned int chunk, off; - - if (unlikely(bytes > i->count)) - bytes = i->count; - if (unlikely(!bytes)) - return 0; - - if (!sanity(i)) - return 0; - - for (size_t n = bytes; n; n -= chunk) { - struct page *page = append_pipe(i, n, &off); - char *p; - - if (!page) - return bytes - n; - chunk = min_t(size_t, n, PAGE_SIZE - off); - p = kmap_local_page(page); - memset(p + off, 0, chunk); - kunmap_local(p); - } - return bytes; -} - size_t iov_iter_zero(size_t bytes, struct iov_iter *i) { - if (unlikely(iov_iter_is_pipe(i))) - return pipe_zero(bytes, i); iterate_and_advance(i, bytes, base, len, count, clear_user(base, len), memset(base, 0, len) @@ -878,32 +587,6 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t byt } EXPORT_SYMBOL(copy_page_from_iter_atomic); -static void pipe_advance(struct iov_iter *i, size_t size) -{ - struct pipe_inode_info *pipe = i->pipe; - int off = i->last_offset; - - if (!off && !size) { - pipe_discard_from(pipe, i->start_head); // discard everything - return; - } - i->count -= size; - while (1) { - struct pipe_buffer *buf = pipe_buf(pipe, i->head); - if (off) /* make it relative to the beginning of buffer */ - size += abs(off) - buf->offset; - if (size <= buf->len) { - buf->len = size; - i->last_offset = last_offset(buf); - break; - } - size -= buf->len; - i->head++; - off = 0; - } - pipe_discard_from(pipe, i->head + 1); // discard everything past this one -} - static void iov_iter_bvec_advance(struct iov_iter *i, size_t size) { const struct bio_vec *bvec, *end; @@ -955,8 +638,6 @@ void iov_iter_advance(struct iov_iter *i, size_t size) iov_iter_iovec_advance(i, size); } else if (iov_iter_is_bvec(i)) { iov_iter_bvec_advance(i, size); - } else if (iov_iter_is_pipe(i)) { - pipe_advance(i, size); } else if (iov_iter_is_discard(i)) { i->count -= size; } @@ -970,26 +651,6 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) if (WARN_ON(unroll > MAX_RW_COUNT)) return; i->count += unroll; - if (unlikely(iov_iter_is_pipe(i))) { - struct pipe_inode_info *pipe = i->pipe; - unsigned int head = pipe->head; - - while (head > i->start_head) { - struct pipe_buffer *b = pipe_buf(pipe, --head); - if (unroll < b->len) { - b->len -= unroll; - i->last_offset = last_offset(b); - i->head = head; - return; - } - unroll -= b->len; - pipe_buf_release(pipe, b); - pipe->head--; - } - i->last_offset = 0; - i->head = head; - return; - } if (unlikely(iov_iter_is_discard(i))) return; if (unroll <= i->iov_offset) { @@ -1079,24 +740,6 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction, } EXPORT_SYMBOL(iov_iter_bvec); -void iov_iter_pipe(struct iov_iter *i, unsigned int direction, - struct pipe_inode_info *pipe, - size_t count) -{ - BUG_ON(direction != READ); - WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size)); - *i = (struct iov_iter){ - .iter_type = ITER_PIPE, - .data_source = false, - .pipe = pipe, - .head = pipe->head, - .start_head = pipe->head, - .last_offset = 0, - .count = count - }; -} -EXPORT_SYMBOL(iov_iter_pipe); - /** * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray * @i: The iterator to initialise. @@ -1224,19 +867,6 @@ bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, if (iov_iter_is_bvec(i)) return iov_iter_aligned_bvec(i, addr_mask, len_mask); - if (iov_iter_is_pipe(i)) { - size_t size = i->count; - - if (size & len_mask) - return false; - if (size && i->last_offset > 0) { - if (i->last_offset & addr_mask) - return false; - } - - return true; - } - if (iov_iter_is_xarray(i)) { if (i->count & len_mask) return false; @@ -1307,14 +937,6 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) if (iov_iter_is_bvec(i)) return iov_iter_alignment_bvec(i); - if (iov_iter_is_pipe(i)) { - size_t size = i->count; - - if (size && i->last_offset > 0) - return size | i->last_offset; - return size; - } - if (iov_iter_is_xarray(i)) return (i->xarray_start + i->iov_offset) | i->count; @@ -1367,36 +989,6 @@ static int want_pages_array(struct page ***res, size_t size, return count; } -static ssize_t pipe_get_pages(struct iov_iter *i, - struct page ***pages, size_t maxsize, unsigned maxpages, - size_t *start) -{ - unsigned int npages, count, off, chunk; - struct page **p; - size_t left; - - if (!sanity(i)) - return -EFAULT; - - *start = off = pipe_npages(i, &npages); - if (!npages) - return -EFAULT; - count = want_pages_array(pages, maxsize, off, min(npages, maxpages)); - if (!count) - return -ENOMEM; - p = *pages; - for (npages = 0, left = maxsize ; npages < count; npages++, left -= chunk) { - struct page *page = append_pipe(i, left, &off); - if (!page) - break; - chunk = min_t(size_t, left, PAGE_SIZE - off); - get_page(*p++ = page); - } - if (!npages) - return -EFAULT; - return maxsize - left; -} - static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa, pgoff_t index, unsigned int nr_pages) { @@ -1547,8 +1139,6 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, } return maxsize; } - if (iov_iter_is_pipe(i)) - return pipe_get_pages(i, pages, maxsize, maxpages, start); if (iov_iter_is_xarray(i)) return iter_xarray_get_pages(i, pages, maxsize, maxpages, start); return -EFAULT; @@ -1638,9 +1228,7 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate, } sum = csum_shift(csstate->csum, csstate->off); - if (unlikely(iov_iter_is_pipe(i))) - bytes = csum_and_copy_to_pipe_iter(addr, bytes, i, &sum); - else iterate_and_advance(i, bytes, base, len, off, ({ + iterate_and_advance(i, bytes, base, len, off, ({ next = csum_and_copy_to_user(addr + off, base, len); sum = csum_block_add(sum, next, off); next ? 0 : len; @@ -1725,15 +1313,6 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages) return iov_npages(i, maxpages); if (iov_iter_is_bvec(i)) return bvec_npages(i, maxpages); - if (iov_iter_is_pipe(i)) { - int npages; - - if (!sanity(i)) - return 0; - - pipe_npages(i, &npages); - return min(npages, maxpages); - } if (iov_iter_is_xarray(i)) { unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE; int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); @@ -1746,10 +1325,6 @@ EXPORT_SYMBOL(iov_iter_npages); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) { *new = *old; - if (unlikely(iov_iter_is_pipe(new))) { - WARN_ON(1); - return NULL; - } if (iov_iter_is_bvec(new)) return new->bvec = kmemdup(new->bvec, new->nr_segs * sizeof(struct bio_vec), diff --git a/mm/filemap.c b/mm/filemap.c index 0fcb0b80c2e2..603b562d69b1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2687,8 +2687,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(iocb->ki_pos >= i_size_read(inode))) break; - error = filemap_get_pages(iocb, iter->count, &fbatch, - iov_iter_is_pipe(iter)); + error = filemap_get_pages(iocb, iter->count, &fbatch, false); if (error < 0) break; -- cgit v1.2.3 From 9eee8bd81421c5e961cbb1a3c3fa1a06fad545e8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 May 2023 14:50:18 +0100 Subject: splice: kdoc for filemap_splice_read() and copy_splice_read() Provide kerneldoc comments for filemap_splice_read() and copy_splice_read(). Signed-off-by: David Howells cc: Christian Brauner cc: Christoph Hellwig cc: Jens Axboe cc: Steve French cc: Al Viro cc: linux-mm@kvack.org cc: linux-block@vger.kernel.org cc: linux-cifs@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20230522135018.2742245-32-dhowells@redhat.com Signed-off-by: Jens Axboe --- fs/splice.c | 21 +++++++++++++++++++-- mm/filemap.c | 21 ++++++++++++++++++--- 2 files changed, 37 insertions(+), 5 deletions(-) (limited to 'mm/filemap.c') diff --git a/fs/splice.c b/fs/splice.c index 9be4cb3b9879..2420ead610a7 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -299,8 +299,25 @@ void splice_shrink_spd(struct splice_pipe_desc *spd) kfree(spd->partial); } -/* - * Copy data from a file into pages and then splice those into the output pipe. +/** + * copy_splice_read - Copy data from a file and splice the copy into a pipe + * @in: The file to read from + * @ppos: Pointer to the file position to read from + * @pipe: The pipe to splice into + * @len: The amount to splice + * @flags: The SPLICE_F_* flags + * + * This function allocates a bunch of pages sufficient to hold the requested + * amount of data (but limited by the remaining pipe capacity), passes it to + * the file's ->read_iter() to read into and then splices the used pages into + * the pipe. + * + * Return: On success, the number of bytes read will be returned and *@ppos + * will be updated if appropriate; 0 will be returned if there is no more data + * to be read; -EAGAIN will be returned if the pipe had no space, and some + * other negative error code will be returned on error. A short read may occur + * if the pipe has insufficient space, we reach the end of the data or we hit a + * hole. */ ssize_t copy_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, diff --git a/mm/filemap.c b/mm/filemap.c index 603b562d69b1..f87e2ad8cff1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2871,9 +2871,24 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, return spliced; } -/* - * Splice folios from the pagecache of a buffered (ie. non-O_DIRECT) file into - * a pipe. +/** + * filemap_splice_read - Splice data from a file's pagecache into a pipe + * @in: The file to read from + * @ppos: Pointer to the file position to read from + * @pipe: The pipe to splice into + * @len: The amount to splice + * @flags: The SPLICE_F_* flags + * + * This function gets folios from a file's pagecache and splices them into the + * pipe. Readahead will be called as necessary to fill more folios. This may + * be used for blockdevs also. + * + * Return: On success, the number of bytes read will be returned and *@ppos + * will be updated if appropriate; 0 will be returned if there is no more data + * to be read; -EAGAIN will be returned if the pipe had no space, and some + * other negative error code will be returned on error. A short read may occur + * if the pipe has insufficient space, we reach the end of the data or we hit a + * hole. */ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, -- cgit v1.2.3 From cf264e1329fb0307e044f7675849f9f38b44c11a Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Tue, 2 May 2023 18:36:07 -0700 Subject: cachestat: implement cachestat syscall There is currently no good way to query the page cache state of large file sets and directory trees. There is mincore(), but it scales poorly: the kernel writes out a lot of bitmap data that userspace has to aggregate, when the user really doesn not care about per-page information in that case. The user also needs to mmap and unmap each file as it goes along, which can be quite slow as well. Some use cases where this information could come in handy: * Allowing database to decide whether to perform an index scan or direct table queries based on the in-memory cache state of the index. * Visibility into the writeback algorithm, for performance issues diagnostic. * Workload-aware writeback pacing: estimating IO fulfilled by page cache (and IO to be done) within a range of a file, allowing for more frequent syncing when and where there is IO capacity, and batching when there is not. * Computing memory usage of large files/directory trees, analogous to the du tool for disk usage. More information about these use cases could be found in the following thread: https://lore.kernel.org/lkml/20230315170934.GA97793@cmpxchg.org/ This patch implements a new syscall that queries cache state of a file and summarizes the number of cached pages, number of dirty pages, number of pages marked for writeback, number of (recently) evicted pages, etc. in a given range. Currently, the syscall is only wired in for x86 architecture. NAME cachestat - query the page cache statistics of a file. SYNOPSIS #include struct cachestat_range { __u64 off; __u64 len; }; struct cachestat { __u64 nr_cache; __u64 nr_dirty; __u64 nr_writeback; __u64 nr_evicted; __u64 nr_recently_evicted; }; int cachestat(unsigned int fd, struct cachestat_range *cstat_range, struct cachestat *cstat, unsigned int flags); DESCRIPTION cachestat() queries the number of cached pages, number of dirty pages, number of pages marked for writeback, number of evicted pages, number of recently evicted pages, in the bytes range given by `off` and `len`. An evicted page is a page that is previously in the page cache but has been evicted since. A page is recently evicted if its last eviction was recent enough that its reentry to the cache would indicate that it is actively being used by the system, and that there is memory pressure on the system. These values are returned in a cachestat struct, whose address is given by the `cstat` argument. The `off` and `len` arguments must be non-negative integers. If `len` > 0, the queried range is [`off`, `off` + `len`]. If `len` == 0, we will query in the range from `off` to the end of the file. The `flags` argument is unused for now, but is included for future extensibility. User should pass 0 (i.e no flag specified). Currently, hugetlbfs is not supported. Because the status of a page can change after cachestat() checks it but before it returns to the application, the returned values may contain stale information. RETURN VALUE On success, cachestat returns 0. On error, -1 is returned, and errno is set to indicate the error. ERRORS EFAULT cstat or cstat_args points to an invalid address. EINVAL invalid flags. EBADF invalid file descriptor. EOPNOTSUPP file descriptor is of a hugetlbfs file [nphamcs@gmail.com: replace rounddown logic with the existing helper] Link: https://lkml.kernel.org/r/20230504022044.3675469-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20230503013608.2431726-3-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Johannes Weiner Cc: Brian Foster Cc: Matthew Wilcox (Oracle) Cc: Michael Kerrisk Signed-off-by: Andrew Morton --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 5 + include/uapi/asm-generic/unistd.h | 5 +- include/uapi/linux/mman.h | 14 +++ init/Kconfig | 10 ++ kernel/sys_ni.c | 1 + mm/filemap.c | 171 +++++++++++++++++++++++++++++++++ 8 files changed, 207 insertions(+), 1 deletion(-) (limited to 'mm/filemap.c') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 320480a8db4f..bc0a3c941b35 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -455,3 +455,4 @@ 448 i386 process_mrelease sys_process_mrelease 449 i386 futex_waitv sys_futex_waitv 450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node +451 i386 cachestat sys_cachestat diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index c84d12608cd2..227538b0ce80 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -372,6 +372,7 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat # # Due to a historical design error, certain syscalls are numbered differently diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 33a0ee3bcb2e..6648c07c4381 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -72,6 +72,8 @@ struct open_how; struct mount_attr; struct landlock_ruleset_attr; enum landlock_rule_type; +struct cachestat_range; +struct cachestat; #include #include @@ -1058,6 +1060,9 @@ asmlinkage long sys_memfd_secret(unsigned int flags); asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len, unsigned long home_node, unsigned long flags); +asmlinkage long sys_cachestat(unsigned int fd, + struct cachestat_range __user *cstat_range, + struct cachestat __user *cstat, unsigned int flags); /* * Architecture-specific system calls diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 45fa180cc56a..cd639fae9086 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -886,8 +886,11 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv) #define __NR_set_mempolicy_home_node 450 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) +#define __NR_cachestat 451 +__SYSCALL(__NR_cachestat, sys_cachestat) + #undef __NR_syscalls -#define __NR_syscalls 451 +#define __NR_syscalls 452 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h index f55bc680b5b0..a246e11988d5 100644 --- a/include/uapi/linux/mman.h +++ b/include/uapi/linux/mman.h @@ -4,6 +4,7 @@ #include #include +#include #define MREMAP_MAYMOVE 1 #define MREMAP_FIXED 2 @@ -41,4 +42,17 @@ #define MAP_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB #define MAP_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB +struct cachestat_range { + __u64 off; + __u64 len; +}; + +struct cachestat { + __u64 nr_cache; + __u64 nr_dirty; + __u64 nr_writeback; + __u64 nr_evicted; + __u64 nr_recently_evicted; +}; + #endif /* _UAPI_LINUX_MMAN_H */ diff --git a/init/Kconfig b/init/Kconfig index 32c24950c4ce..f7f65af4ee12 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1771,6 +1771,16 @@ config RSEQ If unsure, say Y. +config CACHESTAT_SYSCALL + bool "Enable cachestat() system call" if EXPERT + default y + help + Enable the cachestat system call, which queries the page cache + statistics of a file (number of cached pages, dirty pages, + pages marked for writeback, (recently) evicted pages). + + If unsure say Y here. + config DEBUG_RSEQ default n bool "Enabled debugging of rseq() system call" if EXPERT diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 860b2dcf3ac4..04bfb1e4d377 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -299,6 +299,7 @@ COND_SYSCALL(set_mempolicy); COND_SYSCALL(migrate_pages); COND_SYSCALL(move_pages); COND_SYSCALL(set_mempolicy_home_node); +COND_SYSCALL(cachestat); COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); diff --git a/mm/filemap.c b/mm/filemap.c index b4c9bd368b7e..2d3d70c64dfd 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,8 @@ #include +#include "swap.h" + /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. @@ -4119,3 +4122,171 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp) return try_to_free_buffers(folio); } EXPORT_SYMBOL(filemap_release_folio); + +#ifdef CONFIG_CACHESTAT_SYSCALL +/** + * filemap_cachestat() - compute the page cache statistics of a mapping + * @mapping: The mapping to compute the statistics for. + * @first_index: The starting page cache index. + * @last_index: The final page index (inclusive). + * @cs: the cachestat struct to write the result to. + * + * This will query the page cache statistics of a mapping in the + * page range of [first_index, last_index] (inclusive). The statistics + * queried include: number of dirty pages, number of pages marked for + * writeback, and the number of (recently) evicted pages. + */ +static void filemap_cachestat(struct address_space *mapping, + pgoff_t first_index, pgoff_t last_index, struct cachestat *cs) +{ + XA_STATE(xas, &mapping->i_pages, first_index); + struct folio *folio; + + rcu_read_lock(); + xas_for_each(&xas, folio, last_index) { + unsigned long nr_pages; + pgoff_t folio_first_index, folio_last_index; + + if (xas_retry(&xas, folio)) + continue; + + if (xa_is_value(folio)) { + /* page is evicted */ + void *shadow = (void *)folio; + bool workingset; /* not used */ + int order = xa_get_order(xas.xa, xas.xa_index); + + nr_pages = 1 << order; + folio_first_index = round_down(xas.xa_index, 1 << order); + folio_last_index = folio_first_index + nr_pages - 1; + + /* Folios might straddle the range boundaries, only count covered pages */ + if (folio_first_index < first_index) + nr_pages -= first_index - folio_first_index; + + if (folio_last_index > last_index) + nr_pages -= folio_last_index - last_index; + + cs->nr_evicted += nr_pages; + +#ifdef CONFIG_SWAP /* implies CONFIG_MMU */ + if (shmem_mapping(mapping)) { + /* shmem file - in swap cache */ + swp_entry_t swp = radix_to_swp_entry(folio); + + shadow = get_shadow_from_swap_cache(swp); + } +#endif + if (workingset_test_recent(shadow, true, &workingset)) + cs->nr_recently_evicted += nr_pages; + + goto resched; + } + + nr_pages = folio_nr_pages(folio); + folio_first_index = folio_pgoff(folio); + folio_last_index = folio_first_index + nr_pages - 1; + + /* Folios might straddle the range boundaries, only count covered pages */ + if (folio_first_index < first_index) + nr_pages -= first_index - folio_first_index; + + if (folio_last_index > last_index) + nr_pages -= folio_last_index - last_index; + + /* page is in cache */ + cs->nr_cache += nr_pages; + + if (folio_test_dirty(folio)) + cs->nr_dirty += nr_pages; + + if (folio_test_writeback(folio)) + cs->nr_writeback += nr_pages; + +resched: + if (need_resched()) { + xas_pause(&xas); + cond_resched_rcu(); + } + } + rcu_read_unlock(); +} + +/* + * The cachestat(2) system call. + * + * cachestat() returns the page cache statistics of a file in the + * bytes range specified by `off` and `len`: number of cached pages, + * number of dirty pages, number of pages marked for writeback, + * number of evicted pages, and number of recently evicted pages. + * + * An evicted page is a page that is previously in the page cache + * but has been evicted since. A page is recently evicted if its last + * eviction was recent enough that its reentry to the cache would + * indicate that it is actively being used by the system, and that + * there is memory pressure on the system. + * + * `off` and `len` must be non-negative integers. If `len` > 0, + * the queried range is [`off`, `off` + `len`]. If `len` == 0, + * we will query in the range from `off` to the end of the file. + * + * The `flags` argument is unused for now, but is included for future + * extensibility. User should pass 0 (i.e no flag specified). + * + * Currently, hugetlbfs is not supported. + * + * Because the status of a page can change after cachestat() checks it + * but before it returns to the application, the returned values may + * contain stale information. + * + * return values: + * zero - success + * -EFAULT - cstat or cstat_range points to an illegal address + * -EINVAL - invalid flags + * -EBADF - invalid file descriptor + * -EOPNOTSUPP - file descriptor is of a hugetlbfs file + */ +SYSCALL_DEFINE4(cachestat, unsigned int, fd, + struct cachestat_range __user *, cstat_range, + struct cachestat __user *, cstat, unsigned int, flags) +{ + struct fd f = fdget(fd); + struct address_space *mapping; + struct cachestat_range csr; + struct cachestat cs; + pgoff_t first_index, last_index; + + if (!f.file) + return -EBADF; + + if (copy_from_user(&csr, cstat_range, + sizeof(struct cachestat_range))) { + fdput(f); + return -EFAULT; + } + + /* hugetlbfs is not supported */ + if (is_file_hugepages(f.file)) { + fdput(f); + return -EOPNOTSUPP; + } + + if (flags != 0) { + fdput(f); + return -EINVAL; + } + + first_index = csr.off >> PAGE_SHIFT; + last_index = + csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT; + memset(&cs, 0, sizeof(struct cachestat)); + mapping = f.file->f_mapping; + filemap_cachestat(mapping, first_index, last_index, &cs); + fdput(f); + + if (copy_to_user(cstat, &cs, sizeof(struct cachestat))) + return -EFAULT; + + return 0; +} +#endif /* CONFIG_CACHESTAT_SYSCALL */ -- cgit v1.2.3 From c963901197188189e85b4d768a059fe1bbc2a502 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Wed, 10 May 2023 14:47:16 +0200 Subject: filemap: remove page_endio() page_endio() is not used anymore. Remove it. Link: https://lkml.kernel.org/r/20230510124716.73655-1-p.raghav@samsung.com Signed-off-by: Pankaj Raghav Reviewed-by: Christoph Hellwig Acked-by: Matthew Wilcox (Oracle) Cc: Luis Chamberlain Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 2 -- mm/filemap.c | 30 ------------------------------ 2 files changed, 32 deletions(-) (limited to 'mm/filemap.c') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index a56308a9d1a4..c1ae5ebc375f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1078,8 +1078,6 @@ int filemap_migrate_folio(struct address_space *mapping, struct folio *dst, #else #define filemap_migrate_folio NULL #endif -void page_endio(struct page *page, bool is_write, int err); - void folio_end_private_2(struct folio *folio); void folio_wait_private_2(struct folio *folio); int folio_wait_private_2_killable(struct folio *folio); diff --git a/mm/filemap.c b/mm/filemap.c index 2d3d70c64dfd..570bc8c3db87 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1628,36 +1628,6 @@ void folio_end_writeback(struct folio *folio) } EXPORT_SYMBOL(folio_end_writeback); -/* - * After completing I/O on a page, call this routine to update the page - * flags appropriately - */ -void page_endio(struct page *page, bool is_write, int err) -{ - struct folio *folio = page_folio(page); - - if (!is_write) { - if (!err) { - folio_mark_uptodate(folio); - } else { - folio_clear_uptodate(folio); - folio_set_error(folio); - } - folio_unlock(folio); - } else { - if (err) { - struct address_space *mapping; - - folio_set_error(folio); - mapping = folio_mapping(folio); - if (mapping) - mapping_set_error(mapping, err); - } - folio_end_writeback(folio); - } -} -EXPORT_SYMBOL_GPL(page_endio); - /** * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it. * @folio: The folio to lock -- cgit v1.2.3 From 0d625446d0a451a683a357799912b9e688629707 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 16:58:53 +0200 Subject: backing_dev: remove current->backing_dev_info Patch series "cleanup the filemap / direct I/O interaction", v4. This series cleans up some of the generic write helper calling conventions and the page cache writeback / invalidation for direct I/O. This is a spinoff from the no-bufferhead kernel project, for which we'll want to an use iomap based buffered write path in the block layer. This patch (of 12): The last user of current->backing_dev_info disappeared in commit b9b1335e6403 ("remove bdi_congested() and wb_congested() and related functions"). Remove the field and all assignments to it. Link: https://lkml.kernel.org/r/20230601145904.1385409-1-hch@lst.de Link: https://lkml.kernel.org/r/20230601145904.1385409-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Christian Brauner Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Darrick J. Wong Acked-by: Theodore Ts'o Cc: Al Viro Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Chao Yu Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Miklos Szeredi Cc: Trond Myklebust Cc: Xiubo Li Signed-off-by: Andrew Morton --- fs/btrfs/file.c | 6 +----- fs/ceph/file.c | 4 ---- fs/ext4/file.c | 2 -- fs/f2fs/file.c | 2 -- fs/fuse/file.c | 4 ---- fs/gfs2/file.c | 2 -- fs/nfs/file.c | 5 +---- fs/ntfs/file.c | 2 -- fs/ntfs3/file.c | 3 --- fs/xfs/xfs_file.c | 4 ---- include/linux/sched.h | 3 --- mm/filemap.c | 3 --- 12 files changed, 2 insertions(+), 38 deletions(-) (limited to 'mm/filemap.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f649647392e0..ecd43ab66fa6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1145,7 +1145,6 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) return -EAGAIN; - current->backing_dev_info = inode_to_bdi(inode); ret = file_remove_privs(file); if (ret) return ret; @@ -1165,10 +1164,8 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, loff_t end_pos = round_up(pos + count, fs_info->sectorsize); ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos); - if (ret) { - current->backing_dev_info = NULL; + if (ret) return ret; - } } return 0; @@ -1689,7 +1686,6 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, if (sync) atomic_dec(&inode->sync_writers); - current->backing_dev_info = NULL; return num_written; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index f4d8bf7dec88..c8ef72f723ba 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1791,9 +1791,6 @@ retry_snap: else ceph_start_io_write(inode); - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - if (iocb->ki_flags & IOCB_APPEND) { err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); if (err < 0) @@ -1940,7 +1937,6 @@ out: ceph_end_io_write(inode); out_unlocked: ceph_free_cap_flush(prealloc_cf); - current->backing_dev_info = NULL; return written ? written : err; } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index d101b3b0c7da..bc430270c23c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -285,9 +285,7 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, if (ret <= 0) goto out; - current->backing_dev_info = inode_to_bdi(inode); ret = generic_perform_write(iocb, from); - current->backing_dev_info = NULL; out: inode_unlock(inode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5ac53d2627d2..4f423d367a44 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4517,9 +4517,7 @@ static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb, if (iocb->ki_flags & IOCB_NOWAIT) return -EOPNOTSUPP; - current->backing_dev_info = inode_to_bdi(inode); ret = generic_perform_write(iocb, from); - current->backing_dev_info = NULL; if (ret > 0) { iocb->ki_pos += ret; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 89d97f6188e0..97d435874b14 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1362,9 +1362,6 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) writethrough: inode_lock(inode); - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - err = generic_write_checks(iocb, from); if (err <= 0) goto out; @@ -1409,7 +1406,6 @@ writethrough: iocb->ki_pos += written; } out: - current->backing_dev_info = NULL; inode_unlock(inode); if (written > 0) written = generic_write_sync(iocb, written); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 300844f50dcd..904a0d6ac1a1 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1041,11 +1041,9 @@ retry: goto out_unlock; } - current->backing_dev_info = inode_to_bdi(inode); pagefault_disable(); ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); pagefault_enable(); - current->backing_dev_info = NULL; if (ret > 0) { iocb->ki_pos += ret; written += ret; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index f0edf5a36237..665ce3fc62ea 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -648,11 +648,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) since = filemap_sample_wb_err(file->f_mapping); nfs_start_io_write(inode); result = generic_write_checks(iocb, from); - if (result > 0) { - current->backing_dev_info = inode_to_bdi(inode); + if (result > 0) result = generic_perform_write(iocb, from); - current->backing_dev_info = NULL; - } nfs_end_io_write(inode); if (result <= 0) goto out; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index c481b14e4fd9..e296f804a9c4 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1911,11 +1911,9 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(vi); /* We can write back this queue in page reclaim. */ - current->backing_dev_info = inode_to_bdi(vi); err = ntfs_prepare_file_for_write(iocb, from); if (iov_iter_count(from) && !err) written = ntfs_perform_write(file, from, iocb->ki_pos); - current->backing_dev_info = NULL; inode_unlock(vi); iocb->ki_pos += written; if (likely(written > 0)) diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 9a3d55c367d9..86d16a2c8339 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -820,7 +820,6 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) if (!pages) return -ENOMEM; - current->backing_dev_info = inode_to_bdi(inode); err = file_remove_privs(file); if (err) goto out; @@ -993,8 +992,6 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) out: kfree(pages); - current->backing_dev_info = NULL; - if (err < 0) return err; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index aede746541f8..431c3fd0e2b5 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -717,9 +717,6 @@ write_retry: if (ret) goto out; - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - trace_xfs_file_buffered_write(iocb, from); ret = iomap_file_buffered_write(iocb, from, &xfs_buffered_write_iomap_ops); @@ -753,7 +750,6 @@ write_retry: goto write_retry; } - current->backing_dev_info = NULL; out: if (iolock) xfs_iunlock(ip, iolock); diff --git a/include/linux/sched.h b/include/linux/sched.h index eed5d65b8d1f..54780571fe9a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -41,7 +41,6 @@ /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; -struct backing_dev_info; struct bio_list; struct blk_plug; struct bpf_local_storage; @@ -1186,8 +1185,6 @@ struct task_struct { /* VM state: */ struct reclaim_state *reclaim_state; - struct backing_dev_info *backing_dev_info; - struct io_context *io_context; #ifdef CONFIG_COMPACTION diff --git a/mm/filemap.c b/mm/filemap.c index 570bc8c3db87..0d371ed91a68 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3964,8 +3964,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t err; ssize_t status; - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); err = file_remove_privs(file); if (err) goto out; @@ -4026,7 +4024,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) iocb->ki_pos += written; } out: - current->backing_dev_info = NULL; return written ? written : err; } EXPORT_SYMBOL(__generic_file_write_iter); -- cgit v1.2.3 From 182c25e9c157f37bd0ab5a82fe2417e2223df459 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 16:58:55 +0200 Subject: filemap: update ki_pos in generic_perform_write All callers of generic_perform_write need to updated ki_pos, move it into common code. Link: https://lkml.kernel.org/r/20230601145904.1385409-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Xiubo Li Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Acked-by: Theodore Ts'o Acked-by: Darrick J. Wong Cc: Al Viro Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Chao Yu Cc: Christian Brauner Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Miklos Szeredi Cc: Trond Myklebust Signed-off-by: Andrew Morton --- fs/ceph/file.c | 2 -- fs/ext4/file.c | 9 +++------ fs/f2fs/file.c | 1 - fs/nfs/file.c | 1 - mm/filemap.c | 8 ++++---- 5 files changed, 7 insertions(+), 14 deletions(-) (limited to 'mm/filemap.c') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index c8ef72f723ba..767f4dfe7def 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1891,8 +1891,6 @@ retry_snap: * can not run at the same time */ written = generic_perform_write(iocb, from); - if (likely(written >= 0)) - iocb->ki_pos = pos + written; ceph_end_io_write(inode); } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index bc430270c23c..ea0ada3985cb 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -289,12 +289,9 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, out: inode_unlock(inode); - if (likely(ret > 0)) { - iocb->ki_pos += ret; - ret = generic_write_sync(iocb, ret); - } - - return ret; + if (unlikely(ret <= 0)) + return ret; + return generic_write_sync(iocb, ret); } static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 4f423d367a44..7134fe8bd008 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4520,7 +4520,6 @@ static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb, ret = generic_perform_write(iocb, from); if (ret > 0) { - iocb->ki_pos += ret; f2fs_update_iostat(F2FS_I_SB(inode), inode, APP_BUFFERED_IO, ret); } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 665ce3fc62ea..e8bb4c48a321 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -655,7 +655,6 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) goto out; written = result; - iocb->ki_pos += written; nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); if (mntflags & NFS_MOUNT_WRITE_EAGER) { diff --git a/mm/filemap.c b/mm/filemap.c index 0d371ed91a68..3a80a69fa9fa 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3930,7 +3930,10 @@ again: balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(i)); - return written ? written : status; + if (!written) + return status; + iocb->ki_pos += written; + return written; } EXPORT_SYMBOL(generic_perform_write); @@ -4007,7 +4010,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) endbyte = pos + status - 1; err = filemap_write_and_wait_range(mapping, pos, endbyte); if (err == 0) { - iocb->ki_pos = endbyte + 1; written += status; invalidate_mapping_pages(mapping, pos >> PAGE_SHIFT, @@ -4020,8 +4022,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } } else { written = generic_perform_write(iocb, from); - if (likely(written > 0)) - iocb->ki_pos += written; } out: return written ? written : err; -- cgit v1.2.3 From 3c435a0fe35c220bec442dffff04a64aacf952b0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 16:58:56 +0200 Subject: filemap: add a kiocb_write_and_wait helper Factor out a helper that does filemap_write_and_wait_range for the range covered by a read kiocb, or returns -EAGAIN if the kiocb is marked as nowait and there would be pages to write. Link: https://lkml.kernel.org/r/20230601145904.1385409-5-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Acked-by: Darrick J. Wong Cc: Al Viro Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Chao Yu Cc: Christian Brauner Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Miklos Szeredi Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Xiubo Li Signed-off-by: Andrew Morton --- block/fops.c | 18 +++--------------- include/linux/pagemap.h | 2 ++ mm/filemap.c | 30 ++++++++++++++++++------------ 3 files changed, 23 insertions(+), 27 deletions(-) (limited to 'mm/filemap.c') diff --git a/block/fops.c b/block/fops.c index 58d0aebc7313..575171049c5d 100644 --- a/block/fops.c +++ b/block/fops.c @@ -576,21 +576,9 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) goto reexpand; /* skip atime */ if (iocb->ki_flags & IOCB_DIRECT) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - - if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_needs_writeback(mapping, pos, - pos + count - 1)) { - ret = -EAGAIN; - goto reexpand; - } - } else { - ret = filemap_write_and_wait_range(mapping, pos, - pos + count - 1); - if (ret < 0) - goto reexpand; - } - + ret = kiocb_write_and_wait(iocb, count); + if (ret < 0) + goto reexpand; file_accessed(iocb->ki_filp); ret = blkdev_direct_IO(iocb, to); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c1ae5ebc375f..b6a12ca108b7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -30,6 +30,7 @@ static inline void invalidate_remote_inode(struct inode *inode) int invalidate_inode_pages2(struct address_space *mapping); int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end); + int write_inode_now(struct inode *, int sync); int filemap_fdatawrite(struct address_space *); int filemap_flush(struct address_space *); @@ -54,6 +55,7 @@ int filemap_check_errors(struct address_space *mapping); void __filemap_set_wb_err(struct address_space *mapping, int err); int filemap_fdatawrite_wbc(struct address_space *mapping, struct writeback_control *wbc); +int kiocb_write_and_wait(struct kiocb *iocb, size_t count); static inline int filemap_write_and_wait(struct address_space *mapping) { diff --git a/mm/filemap.c b/mm/filemap.c index 3a80a69fa9fa..5566e10ca1a7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2735,6 +2735,21 @@ put_folios: } EXPORT_SYMBOL_GPL(filemap_read); +int kiocb_write_and_wait(struct kiocb *iocb, size_t count) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + loff_t pos = iocb->ki_pos; + loff_t end = pos + count - 1; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_needs_writeback(mapping, pos, end)) + return -EAGAIN; + return 0; + } + + return filemap_write_and_wait_range(mapping, pos, end); +} + /** * generic_file_read_iter - generic filesystem read routine * @iocb: kernel I/O control block @@ -2770,18 +2785,9 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_needs_writeback(mapping, iocb->ki_pos, - iocb->ki_pos + count - 1)) - return -EAGAIN; - } else { - retval = filemap_write_and_wait_range(mapping, - iocb->ki_pos, - iocb->ki_pos + count - 1); - if (retval < 0) - return retval; - } - + retval = kiocb_write_and_wait(iocb, count); + if (retval < 0) + return retval; file_accessed(file); retval = mapping->a_ops->direct_IO(iocb, iter); -- cgit v1.2.3 From e003f74afbd2feadbb9ffbf9135e2d2fb5d320a5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 16:58:57 +0200 Subject: filemap: add a kiocb_invalidate_pages helper Factor out a helper that calls filemap_write_and_wait_range and invalidate_inode_pages2_range for the range covered by a write kiocb or returns -EAGAIN if the kiocb is marked as nowait and there would be pages to write or invalidate. Link: https://lkml.kernel.org/r/20230601145904.1385409-6-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Acked-by: Darrick J. Wong Cc: Al Viro Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Chao Yu Cc: Christian Brauner Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Miklos Szeredi Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Xiubo Li Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 1 + mm/filemap.c | 48 ++++++++++++++++++++++++++++-------------------- 2 files changed, 29 insertions(+), 20 deletions(-) (limited to 'mm/filemap.c') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index b6a12ca108b7..7b66a67dba51 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -30,6 +30,7 @@ static inline void invalidate_remote_inode(struct inode *inode) int invalidate_inode_pages2(struct address_space *mapping); int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end); +int kiocb_invalidate_pages(struct kiocb *iocb, size_t count); int write_inode_now(struct inode *, int sync); int filemap_fdatawrite(struct address_space *); diff --git a/mm/filemap.c b/mm/filemap.c index 5566e10ca1a7..6ba6233c4bbb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2750,6 +2750,33 @@ int kiocb_write_and_wait(struct kiocb *iocb, size_t count) return filemap_write_and_wait_range(mapping, pos, end); } +int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + loff_t pos = iocb->ki_pos; + loff_t end = pos + count - 1; + int ret; + + if (iocb->ki_flags & IOCB_NOWAIT) { + /* we could block if there are any pages in the range */ + if (filemap_range_has_page(mapping, pos, end)) + return -EAGAIN; + } else { + ret = filemap_write_and_wait_range(mapping, pos, end); + if (ret) + return ret; + } + + /* + * After a write we want buffered reads to be sure to go to disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that we can return + * without clobbering -EIOCBQUEUED from ->direct_IO(). + */ + return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, + end >> PAGE_SHIFT); +} + /** * generic_file_read_iter - generic filesystem read routine * @iocb: kernel I/O control block @@ -3793,30 +3820,11 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) write_len = iov_iter_count(from); end = (pos + write_len - 1) >> PAGE_SHIFT; - if (iocb->ki_flags & IOCB_NOWAIT) { - /* If there are pages to writeback, return */ - if (filemap_range_has_page(file->f_mapping, pos, - pos + write_len - 1)) - return -EAGAIN; - } else { - written = filemap_write_and_wait_range(mapping, pos, - pos + write_len - 1); - if (written) - goto out; - } - - /* - * After a write we want buffered reads to be sure to go to disk to get - * the new data. We invalidate clean cached page from the region we're - * about to write. We do this *before* the write so that we can return - * without clobbering -EIOCBQUEUED from ->direct_IO(). - */ - written = invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); /* * If a page can not be invalidated, return 0 to fall back * to buffered write. */ + written = kiocb_invalidate_pages(iocb, write_len); if (written) { if (written == -EBUSY) return 0; -- cgit v1.2.3 From c402a9a9430b670926decbb284b756ee6f47c1ec Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 16:58:58 +0200 Subject: filemap: add a kiocb_invalidate_post_direct_write helper Add a helper to invalidate page cache after a dio write. Link: https://lkml.kernel.org/r/20230601145904.1385409-7-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Acked-by: Darrick J. Wong Cc: Al Viro Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Chao Yu Cc: Christian Brauner Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Miklos Szeredi Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Xiubo Li Signed-off-by: Andrew Morton --- fs/direct-io.c | 10 ++-------- fs/iomap/direct-io.c | 12 ++---------- include/linux/fs.h | 5 ----- include/linux/pagemap.h | 1 + mm/filemap.c | 37 ++++++++++++++++++++----------------- 5 files changed, 25 insertions(+), 40 deletions(-) (limited to 'mm/filemap.c') diff --git a/fs/direct-io.c b/fs/direct-io.c index 0b380bb8a81e..4f9069aee0fe 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -285,14 +285,8 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) * zeros from unwritten extents. */ if (flags & DIO_COMPLETE_INVALIDATE && - ret > 0 && dio_op == REQ_OP_WRITE && - dio->inode->i_mapping->nrpages) { - err = invalidate_inode_pages2_range(dio->inode->i_mapping, - offset >> PAGE_SHIFT, - (offset + ret - 1) >> PAGE_SHIFT); - if (err) - dio_warn_stale_pagecache(dio->iocb->ki_filp); - } + ret > 0 && dio_op == REQ_OP_WRITE) + kiocb_invalidate_post_direct_write(dio->iocb, ret); inode_dio_end(dio->inode); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 6207a59d2162..0795c54a745b 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -81,7 +81,6 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) { const struct iomap_dio_ops *dops = dio->dops; struct kiocb *iocb = dio->iocb; - struct inode *inode = file_inode(iocb->ki_filp); loff_t offset = iocb->ki_pos; ssize_t ret = dio->error; @@ -108,15 +107,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) * ->end_io() when necessary, otherwise a racing buffer read would cache * zeros from unwritten extents. */ - if (!dio->error && dio->size && - (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) { - int err; - err = invalidate_inode_pages2_range(inode->i_mapping, - offset >> PAGE_SHIFT, - (offset + dio->size - 1) >> PAGE_SHIFT); - if (err) - dio_warn_stale_pagecache(iocb->ki_filp); - } + if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE)) + kiocb_invalidate_post_direct_write(iocb, dio->size); inode_dio_end(file_inode(iocb->ki_filp)); diff --git a/include/linux/fs.h b/include/linux/fs.h index 86b50271b4f7..4f196f827d9d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2843,11 +2843,6 @@ static inline void inode_dio_end(struct inode *inode) wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); } -/* - * Warn about a page cache invalidation failure diring a direct I/O write. - */ -void dio_warn_stale_pagecache(struct file *filp); - extern void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 7b66a67dba51..716953ee1ebd 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -31,6 +31,7 @@ int invalidate_inode_pages2(struct address_space *mapping); int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end); int kiocb_invalidate_pages(struct kiocb *iocb, size_t count); +void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count); int write_inode_now(struct inode *, int sync); int filemap_fdatawrite(struct address_space *); diff --git a/mm/filemap.c b/mm/filemap.c index 6ba6233c4bbb..b45506f74133 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3789,7 +3789,7 @@ EXPORT_SYMBOL(read_cache_page_gfp); /* * Warn about a page cache invalidation failure during a direct I/O write. */ -void dio_warn_stale_pagecache(struct file *filp) +static void dio_warn_stale_pagecache(struct file *filp) { static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); char pathname[128]; @@ -3806,19 +3806,23 @@ void dio_warn_stale_pagecache(struct file *filp) } } +void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + + if (mapping->nrpages && + invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, + (iocb->ki_pos + count - 1) >> PAGE_SHIFT)) + dio_warn_stale_pagecache(iocb->ki_filp); +} + ssize_t generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - loff_t pos = iocb->ki_pos; - ssize_t written; - size_t write_len; - pgoff_t end; - - write_len = iov_iter_count(from); - end = (pos + write_len - 1) >> PAGE_SHIFT; + struct address_space *mapping = iocb->ki_filp->f_mapping; + size_t write_len = iov_iter_count(from); + ssize_t written; /* * If a page can not be invalidated, return 0 to fall back @@ -3828,7 +3832,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) if (written) { if (written == -EBUSY) return 0; - goto out; + return written; } written = mapping->a_ops->direct_IO(iocb, from); @@ -3850,11 +3854,11 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) * * Skip invalidation for async writes or if mapping has no pages. */ - if (written > 0 && mapping->nrpages && - invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end)) - dio_warn_stale_pagecache(file); - if (written > 0) { + struct inode *inode = mapping->host; + loff_t pos = iocb->ki_pos; + + kiocb_invalidate_post_direct_write(iocb, written); pos += written; write_len -= written; if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { @@ -3865,7 +3869,6 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) } if (written != -EIOCBQUEUED) iov_iter_revert(from, write_len - iov_iter_count(from)); -out: return written; } EXPORT_SYMBOL(generic_file_direct_write); -- cgit v1.2.3 From 44fff0fa08ec5a6d9d5fb05443a36d854d0ece4d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 16:59:01 +0200 Subject: fs: factor out a direct_write_fallback helper Add a helper dealing with handling the syncing of a buffered write fallback for direct I/O. Link: https://lkml.kernel.org/r/20230601145904.1385409-10-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Miklos Szeredi Reviewed-by: Darrick J. Wong Cc: Al Viro Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Chao Yu Cc: Christian Brauner Cc: Hannes Reinecke Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Xiubo Li Signed-off-by: Andrew Morton --- fs/libfs.c | 41 +++++++++++++++++++++++++++++++++ include/linux/fs.h | 2 ++ mm/filemap.c | 66 +++++++++++++----------------------------------------- 3 files changed, 58 insertions(+), 51 deletions(-) (limited to 'mm/filemap.c') diff --git a/fs/libfs.c b/fs/libfs.c index 89cf614a3271..5b851315eeed 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1613,3 +1613,44 @@ u64 inode_query_iversion(struct inode *inode) return cur >> I_VERSION_QUERIED_SHIFT; } EXPORT_SYMBOL(inode_query_iversion); + +ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, + ssize_t direct_written, ssize_t buffered_written) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + loff_t pos = iocb->ki_pos - buffered_written; + loff_t end = iocb->ki_pos - 1; + int err; + + /* + * If the buffered write fallback returned an error, we want to return + * the number of bytes which were written by direct I/O, or the error + * code if that was zero. + * + * Note that this differs from normal direct-io semantics, which will + * return -EFOO even if some bytes were written. + */ + if (unlikely(buffered_written < 0)) { + if (direct_written) + return direct_written; + return buffered_written; + } + + /* + * We need to ensure that the page cache pages are written to disk and + * invalidated to preserve the expected O_DIRECT semantics. + */ + err = filemap_write_and_wait_range(mapping, pos, end); + if (err < 0) { + /* + * We don't know how much we wrote, so just return the number of + * bytes which were direct-written + */ + if (direct_written) + return direct_written; + return err; + } + invalidate_mapping_pages(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); + return direct_written + buffered_written; +} +EXPORT_SYMBOL_GPL(direct_write_fallback); diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f196f827d9d..c363f8687c7e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2744,6 +2744,8 @@ extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *); ssize_t generic_perform_write(struct kiocb *, struct iov_iter *); +ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, + ssize_t direct_written, ssize_t buffered_written); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags); diff --git a/mm/filemap.c b/mm/filemap.c index b45506f74133..916b7c6444fe 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3979,23 +3979,19 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t written = 0; - ssize_t err; - ssize_t status; + struct inode *inode = mapping->host; + ssize_t ret; - err = file_remove_privs(file); - if (err) - goto out; + ret = file_remove_privs(file); + if (ret) + return ret; - err = file_update_time(file); - if (err) - goto out; + ret = file_update_time(file); + if (ret) + return ret; if (iocb->ki_flags & IOCB_DIRECT) { - loff_t pos, endbyte; - - written = generic_file_direct_write(iocb, from); + ret = generic_file_direct_write(iocb, from); /* * If the write stopped short of completing, fall back to * buffered writes. Some filesystems do this for writes to @@ -4003,45 +3999,13 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) * not succeed (even if it did, DAX does not handle dirty * page-cache pages correctly). */ - if (written < 0 || !iov_iter_count(from) || IS_DAX(inode)) - goto out; - - pos = iocb->ki_pos; - status = generic_perform_write(iocb, from); - /* - * If generic_perform_write() returned a synchronous error - * then we want to return the number of bytes which were - * direct-written, or the error code if that was zero. Note - * that this differs from normal direct-io semantics, which - * will return -EFOO even if some bytes were written. - */ - if (unlikely(status < 0)) { - err = status; - goto out; - } - /* - * We need to ensure that the page cache pages are written to - * disk and invalidated to preserve the expected O_DIRECT - * semantics. - */ - endbyte = pos + status - 1; - err = filemap_write_and_wait_range(mapping, pos, endbyte); - if (err == 0) { - written += status; - invalidate_mapping_pages(mapping, - pos >> PAGE_SHIFT, - endbyte >> PAGE_SHIFT); - } else { - /* - * We don't know how much we wrote, so just return - * the number of bytes which were direct-written - */ - } - } else { - written = generic_perform_write(iocb, from); + if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode)) + return ret; + return direct_write_fallback(iocb, from, ret, + generic_perform_write(iocb, from)); } -out: - return written ? written : err; + + return generic_perform_write(iocb, from); } EXPORT_SYMBOL(__generic_file_write_iter); -- cgit v1.2.3 From 9425c591e06a9ab27a145ba655fb50532cf0bcc9 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 2 Jun 2023 15:57:47 -0700 Subject: page cache: fix page_cache_next/prev_miss off by one Ackerley Tng reported an issue with hugetlbfs fallocate here[1]. The issue showed up after the conversion of hugetlb page cache lookup code to use page_cache_next_miss. Code in hugetlb fallocate, userfaultfd and GUP is now using page_cache_next_miss to determine if a page is present the page cache. The following statement is used. present = page_cache_next_miss(mapping, index, 1) != index; There are two issues with page_cache_next_miss when used in this way. 1) If the passed value for index is equal to the 'wrap-around' value, the same index will always be returned. This wrap-around value is 0, so 0 will be returned even if page is present at index 0. 2) If there is no gap in the range passed, the last index in the range will be returned. When passed a range of 1 as above, the passed index value will be returned even if the page is present. The end result is the statement above will NEVER indicate a page is present in the cache, even if it is. As noted by Ackerley in [1], users can see this by hugetlb fallocate incorrectly returning EEXIST if pages are already present in the file. In addition, hugetlb pages will not be included in core dumps if they need to be brought in via GUP. userfaultfd UFFDIO_COPY also uses this code and will not notice pages already present in the cache. It may try to allocate a new page and potentially return ENOMEM as opposed to EEXIST. Both page_cache_next_miss and page_cache_prev_miss have similar issues. Fix by: - Check for index equal to 'wrap-around' value and do not exit early. - If no gap is found in range, return index outside range. - Update function description to say 'wrap-around' value could be returned if passed as index. [1] https://lore.kernel.org/linux-mm/cover.1683069252.git.ackerleytng@google.com/ Link: https://lkml.kernel.org/r/20230602225747.103865-2-mike.kravetz@oracle.com Fixes: d0ce0e47b323 ("mm/hugetlb: convert hugetlb fault paths to use alloc_hugetlb_folio()") Signed-off-by: Mike Kravetz Reported-by: Ackerley Tng Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Cc: Erdem Aktas Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Sidhartha Kumar Cc: Vishal Annapurve Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton --- mm/filemap.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index b4c9bd368b7e..83dda76d1fc3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1760,7 +1760,9 @@ bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm, * * Return: The index of the gap if found, otherwise an index outside the * range specified (in which case 'return - index >= max_scan' will be true). - * In the rare case of index wrap-around, 0 will be returned. + * In the rare case of index wrap-around, 0 will be returned. 0 will also + * be returned if index == 0 and there is a gap at the index. We can not + * wrap-around if passed index == 0. */ pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) @@ -1770,12 +1772,13 @@ pgoff_t page_cache_next_miss(struct address_space *mapping, while (max_scan--) { void *entry = xas_next(&xas); if (!entry || xa_is_value(entry)) - break; - if (xas.xa_index == 0) - break; + return xas.xa_index; + if (xas.xa_index == 0 && index != 0) + return xas.xa_index; } - return xas.xa_index; + /* No gaps in range and no wrap-around, return index beyond range */ + return xas.xa_index + 1; } EXPORT_SYMBOL(page_cache_next_miss); @@ -1796,7 +1799,9 @@ EXPORT_SYMBOL(page_cache_next_miss); * * Return: The index of the gap if found, otherwise an index outside the * range specified (in which case 'index - return >= max_scan' will be true). - * In the rare case of wrap-around, ULONG_MAX will be returned. + * In the rare case of wrap-around, ULONG_MAX will be returned. ULONG_MAX + * will also be returned if index == ULONG_MAX and there is a gap at the + * index. We can not wrap-around if passed index == ULONG_MAX. */ pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) @@ -1806,12 +1811,13 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, while (max_scan--) { void *entry = xas_prev(&xas); if (!entry || xa_is_value(entry)) - break; - if (xas.xa_index == ULONG_MAX) - break; + return xas.xa_index; + if (xas.xa_index == ULONG_MAX && index != ULONG_MAX) + return xas.xa_index; } - return xas.xa_index; + /* No gaps in range and no wrap-around, return index beyond range */ + return xas.xa_index - 1; } EXPORT_SYMBOL(page_cache_prev_miss); -- cgit v1.2.3 From 0cb8fd4d14165a7e654048e43983d86f75b90879 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 8 Jun 2023 18:08:20 -0700 Subject: mm/migrate: remove cruft from migration_entry_wait()s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit migration_entry_wait_on_locked() does not need to take a mapped pte pointer, its callers can do the unmap first. Annotate it with __releases(ptl) to reduce sparse warnings. Fold __migration_entry_wait_huge() into migration_entry_wait_huge(). Fold __migration_entry_wait() into migration_entry_wait(), preferring the tighter pte_offset_map_lock() to pte_offset_map() and pte_lockptr(). Link: https://lkml.kernel.org/r/b0e2a532-cdf2-561b-e999-f3b13b8d6d3@google.com Signed-off-by: Hugh Dickins Reviewed-by: Alistair Popple Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christophe Leroy Cc: Christoph Hellwig Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Ryan Roberts Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Signed-off-by: Andrew Morton --- include/linux/migrate.h | 4 ++-- include/linux/swapops.h | 17 +++-------------- mm/filemap.c | 13 ++++--------- mm/migrate.c | 37 +++++++++++++------------------------ 4 files changed, 22 insertions(+), 49 deletions(-) (limited to 'mm/filemap.c') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 6de5756d8533..711dd9412561 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -75,8 +75,8 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src); -void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, - spinlock_t *ptl); +void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) + __releases(ptl); void folio_migrate_flags(struct folio *newfolio, struct folio *folio); void folio_migrate_copy(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 3a451b7afcb3..4c932cb45e0b 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -332,15 +332,9 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry) return false; } -extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, - spinlock_t *ptl); extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); -#ifdef CONFIG_HUGETLB_PAGE -extern void __migration_entry_wait_huge(struct vm_area_struct *vma, - pte_t *ptep, spinlock_t *ptl); extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte); -#endif /* CONFIG_HUGETLB_PAGE */ #else /* CONFIG_MIGRATION */ static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { @@ -362,15 +356,10 @@ static inline int is_migration_entry(swp_entry_t swp) return 0; } -static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, - spinlock_t *ptl) { } static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, - unsigned long address) { } -#ifdef CONFIG_HUGETLB_PAGE -static inline void __migration_entry_wait_huge(struct vm_area_struct *vma, - pte_t *ptep, spinlock_t *ptl) { } -static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { } -#endif /* CONFIG_HUGETLB_PAGE */ + unsigned long address) { } +static inline void migration_entry_wait_huge(struct vm_area_struct *vma, + pte_t *pte) { } static inline int is_writable_migration_entry(swp_entry_t entry) { return 0; diff --git a/mm/filemap.c b/mm/filemap.c index 916b7c6444fe..e0259fb823a5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1362,8 +1362,6 @@ repeat: /** * migration_entry_wait_on_locked - Wait for a migration entry to be removed * @entry: migration swap entry. - * @ptep: mapped pte pointer. Will return with the ptep unmapped. Only required - * for pte entries, pass NULL for pmd entries. * @ptl: already locked ptl. This function will drop the lock. * * Wait for a migration entry referencing the given page to be removed. This is @@ -1372,13 +1370,13 @@ repeat: * should be called while holding the ptl for the migration entry referencing * the page. * - * Returns after unmapping and unlocking the pte/ptl with pte_unmap_unlock(). + * Returns after unlocking the ptl. * * This follows the same logic as folio_wait_bit_common() so see the comments * there. */ -void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, - spinlock_t *ptl) +void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) + __releases(ptl) { struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; @@ -1412,10 +1410,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, * a valid reference to the page, and it must take the ptl to remove the * migration entry. So the page is valid until the ptl is dropped. */ - if (ptep) - pte_unmap_unlock(ptep, ptl); - else - spin_unlock(ptl); + spin_unlock(ptl); for (;;) { unsigned int flags; diff --git a/mm/migrate.c b/mm/migrate.c index 30b5ce10935e..c1f2c40441e1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -296,14 +296,18 @@ void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked) * get to the page and wait until migration is finished. * When we return from this function the fault will be retried. */ -void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, - spinlock_t *ptl) +void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) { + spinlock_t *ptl; + pte_t *ptep; pte_t pte; swp_entry_t entry; - spin_lock(ptl); + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); pte = *ptep; + pte_unmap(ptep); + if (!is_swap_pte(pte)) goto out; @@ -311,18 +315,10 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, if (!is_migration_entry(entry)) goto out; - migration_entry_wait_on_locked(entry, ptep, ptl); + migration_entry_wait_on_locked(entry, ptl); return; out: - pte_unmap_unlock(ptep, ptl); -} - -void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, - unsigned long address) -{ - spinlock_t *ptl = pte_lockptr(mm, pmd); - pte_t *ptep = pte_offset_map(pmd, address); - __migration_entry_wait(mm, ptep, ptl); + spin_unlock(ptl); } #ifdef CONFIG_HUGETLB_PAGE @@ -332,9 +328,9 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, * * This function will release the vma lock before returning. */ -void __migration_entry_wait_huge(struct vm_area_struct *vma, - pte_t *ptep, spinlock_t *ptl) +void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *ptep) { + spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, ptep); pte_t pte; hugetlb_vma_assert_locked(vma); @@ -352,16 +348,9 @@ void __migration_entry_wait_huge(struct vm_area_struct *vma, * lock release in migration_entry_wait_on_locked(). */ hugetlb_vma_unlock_read(vma); - migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl); + migration_entry_wait_on_locked(pte_to_swp_entry(pte), ptl); } } - -void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) -{ - spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte); - - __migration_entry_wait_huge(vma, pte, ptl); -} #endif #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION @@ -372,7 +361,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) ptl = pmd_lock(mm, pmd); if (!is_pmd_migration_entry(*pmd)) goto unlock; - migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), NULL, ptl); + migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), ptl); return; unlock: spin_unlock(ptl); -- cgit v1.2.3 From 65747aaf42b7db6acb8e57a2b8e9959928f404dd Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 8 Jun 2023 18:11:29 -0700 Subject: mm/filemap: allow pte_offset_map_lock() to fail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit filemap_map_pages() allow pte_offset_map_lock() to fail; and remove the pmd_devmap_trans_unstable() check from filemap_map_pmd(), which can safely return to filemap_map_pages() and let pte_offset_map_lock() discover that. Link: https://lkml.kernel.org/r/54607cf4-ddb6-7ef3-043-1d2de1a9a71@google.com Signed-off-by: Hugh Dickins Cc: Alistair Popple Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christophe Leroy Cc: Christoph Hellwig Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Ryan Roberts Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Signed-off-by: Andrew Morton --- mm/filemap.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index e0259fb823a5..1893048ec9ff 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3414,13 +3414,6 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, if (pmd_none(*vmf->pmd)) pmd_install(mm, vmf->pmd, &vmf->prealloc_pte); - /* See comment in handle_pte_fault() */ - if (pmd_devmap_trans_unstable(vmf->pmd)) { - folio_unlock(folio); - folio_put(folio); - return true; - } - return false; } @@ -3507,6 +3500,11 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); + if (!vmf->pte) { + folio_unlock(folio); + folio_put(folio); + goto out; + } do { again: page = folio_file_page(folio, xas.xa_index); -- cgit v1.2.3 From c33c794828f21217f72ce6fc140e0d34e0d56bff Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 12 Jun 2023 16:15:45 +0100 Subject: mm: ptep_get() conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert all instances of direct pte_t* dereferencing to instead use ptep_get() helper. This means that by default, the accesses change from a C dereference to a READ_ONCE(). This is technically the correct thing to do since where pgtables are modified by HW (for access/dirty) they are volatile and therefore we should always ensure READ_ONCE() semantics. But more importantly, by always using the helper, it can be overridden by the architecture to fully encapsulate the contents of the pte. Arch code is deliberately not converted, as the arch code knows best. It is intended that arch code (arm64) will override the default with its own implementation that can (e.g.) hide certain bits from the core code, or determine young/dirty status by mixing in state from another source. Conversion was done using Coccinelle: ---- // $ make coccicheck \ // COCCI=ptepget.cocci \ // SPFLAGS="--include-headers" \ // MODE=patch virtual patch @ depends on patch @ pte_t *v; @@ - *v + ptep_get(v) ---- Then reviewed and hand-edited to avoid multiple unnecessary calls to ptep_get(), instead opting to store the result of a single call in a variable, where it is correct to do so. This aims to negate any cost of READ_ONCE() and will benefit arch-overrides that may be more complex. Included is a fix for an issue in an earlier version of this patch that was pointed out by kernel test robot. The issue arose because config MMU=n elides definition of the ptep helper functions, including ptep_get(). HUGETLB_PAGE=n configs still define a simple huge_ptep_clear_flush() for linking purposes, which dereferences the ptep. So when both configs are disabled, this caused a build error because ptep_get() is not defined. Fix by continuing to do a direct dereference when MMU=n. This is safe because for this config the arch code cannot be trying to virtualize the ptes because none of the ptep helpers are defined. Link: https://lkml.kernel.org/r/20230612151545.3317766-4-ryan.roberts@arm.com Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202305120142.yXsNEo6H-lkp@intel.com/ Signed-off-by: Ryan Roberts Cc: Adrian Hunter Cc: Alexander Potapenko Cc: Alexander Shishkin Cc: Alex Williamson Cc: Al Viro Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Christian Brauner Cc: Christoph Hellwig Cc: Daniel Vetter Cc: Dave Airlie Cc: Dimitri Sivanich Cc: Dmitry Vyukov Cc: Ian Rogers Cc: Jason Gunthorpe Cc: Jérôme Glisse Cc: Jiri Olsa Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Mark Rutland Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Namhyung Kim Cc: Naoya Horiguchi Cc: Oleksandr Tyshchenko Cc: Pavel Tatashin Cc: Roman Gushchin Cc: SeongJae Park Cc: Shakeel Butt Cc: Uladzislau Rezki (Sony) Cc: Vincenzo Frascino Cc: Yu Zhao Signed-off-by: Andrew Morton --- drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c | 8 +- drivers/misc/sgi-gru/grufault.c | 2 +- drivers/vfio/vfio_iommu_type1.c | 7 +- drivers/xen/privcmd.c | 2 +- fs/proc/task_mmu.c | 33 +++---- fs/userfaultfd.c | 6 +- include/linux/hugetlb.h | 4 + include/linux/mm_inline.h | 2 +- include/linux/pgtable.h | 6 +- kernel/events/uprobes.c | 2 +- mm/damon/ops-common.c | 2 +- mm/damon/paddr.c | 2 +- mm/damon/vaddr.c | 10 ++- mm/filemap.c | 2 +- mm/gup.c | 21 +++-- mm/highmem.c | 12 +-- mm/hmm.c | 2 +- mm/huge_memory.c | 4 +- mm/hugetlb.c | 2 +- mm/hugetlb_vmemmap.c | 6 +- mm/kasan/init.c | 9 +- mm/kasan/shadow.c | 10 +-- mm/khugepaged.c | 22 ++--- mm/ksm.c | 22 ++--- mm/madvise.c | 6 +- mm/mapping_dirty_helpers.c | 4 +- mm/memcontrol.c | 4 +- mm/memory-failure.c | 26 +++--- mm/memory.c | 100 +++++++++++---------- mm/mempolicy.c | 6 +- mm/migrate.c | 14 +-- mm/migrate_device.c | 15 ++-- mm/mincore.c | 2 +- mm/mlock.c | 6 +- mm/mprotect.c | 8 +- mm/mremap.c | 2 +- mm/page_table_check.c | 4 +- mm/page_vma_mapped.c | 27 +++--- mm/pgtable-generic.c | 2 +- mm/rmap.c | 34 ++++--- mm/sparse-vmemmap.c | 8 +- mm/swap_state.c | 8 +- mm/swapfile.c | 20 +++-- mm/userfaultfd.c | 4 +- mm/vmalloc.c | 6 +- mm/vmscan.c | 14 +-- virt/kvm/kvm_main.c | 11 ++- 47 files changed, 301 insertions(+), 228 deletions(-) (limited to 'mm/filemap.c') diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c index 56279908ed30..01e271b6ad21 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c @@ -1681,7 +1681,9 @@ static int igt_mmap_gpu(void *arg) static int check_present_pte(pte_t *pte, unsigned long addr, void *data) { - if (!pte_present(*pte) || pte_none(*pte)) { + pte_t ptent = ptep_get(pte); + + if (!pte_present(ptent) || pte_none(ptent)) { pr_err("missing PTE:%lx\n", (addr - (unsigned long)data) >> PAGE_SHIFT); return -EINVAL; @@ -1692,7 +1694,9 @@ static int check_present_pte(pte_t *pte, unsigned long addr, void *data) static int check_absent_pte(pte_t *pte, unsigned long addr, void *data) { - if (pte_present(*pte) && !pte_none(*pte)) { + pte_t ptent = ptep_get(pte); + + if (pte_present(ptent) && !pte_none(ptent)) { pr_err("present PTE:%lx; expected to be revoked\n", (addr - (unsigned long)data) >> PAGE_SHIFT); return -EINVAL; diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c index 378cf02a2aa1..629edb6486de 100644 --- a/drivers/misc/sgi-gru/grufault.c +++ b/drivers/misc/sgi-gru/grufault.c @@ -228,7 +228,7 @@ static int atomic_pte_lookup(struct vm_area_struct *vma, unsigned long vaddr, goto err; #ifdef CONFIG_X86_64 if (unlikely(pmd_large(*pmdp))) - pte = *(pte_t *) pmdp; + pte = ptep_get((pte_t *)pmdp); else #endif pte = *pte_offset_kernel(pmdp, vaddr); diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 306e6f1d1c70..ebe0ad31d0b0 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -514,6 +514,7 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, bool write_fault) { pte_t *ptep; + pte_t pte; spinlock_t *ptl; int ret; @@ -536,10 +537,12 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, return ret; } - if (write_fault && !pte_write(*ptep)) + pte = ptep_get(ptep); + + if (write_fault && !pte_write(pte)) ret = -EFAULT; else - *pfn = pte_pfn(*ptep); + *pfn = pte_pfn(pte); pte_unmap_unlock(ptep, ptl); return ret; diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index e2f580e30a86..f447cd37cc4c 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -949,7 +949,7 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) */ static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data) { - return pte_none(*pte) ? 0 : -EBUSY; + return pte_none(ptep_get(pte)) ? 0 : -EBUSY; } static int privcmd_vma_range_is_mapped( diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 0d63b6a0f0d8..507cd4e59d07 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -538,13 +538,14 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; bool migration = false, young = false, dirty = false; + pte_t ptent = ptep_get(pte); - if (pte_present(*pte)) { - page = vm_normal_page(vma, addr, *pte); - young = pte_young(*pte); - dirty = pte_dirty(*pte); - } else if (is_swap_pte(*pte)) { - swp_entry_t swpent = pte_to_swp_entry(*pte); + if (pte_present(ptent)) { + page = vm_normal_page(vma, addr, ptent); + young = pte_young(ptent); + dirty = pte_dirty(ptent); + } else if (is_swap_pte(ptent)) { + swp_entry_t swpent = pte_to_swp_entry(ptent); if (!non_swap_entry(swpent)) { int mapcount; @@ -732,11 +733,12 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; struct page *page = NULL; + pte_t ptent = ptep_get(pte); - if (pte_present(*pte)) { - page = vm_normal_page(vma, addr, *pte); - } else if (is_swap_pte(*pte)) { - swp_entry_t swpent = pte_to_swp_entry(*pte); + if (pte_present(ptent)) { + page = vm_normal_page(vma, addr, ptent); + } else if (is_swap_pte(ptent)) { + swp_entry_t swpent = pte_to_swp_entry(ptent); if (is_pfn_swap_entry(swpent)) page = pfn_swap_entry_to_page(swpent); @@ -1105,7 +1107,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, * Documentation/admin-guide/mm/soft-dirty.rst for full description * of how soft-dirty works. */ - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); if (pte_present(ptent)) { pte_t old_pte; @@ -1194,7 +1196,7 @@ out: return 0; } for (; addr != end; pte++, addr += PAGE_SIZE) { - ptent = *pte; + ptent = ptep_get(pte); if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty(vma, addr, pte); @@ -1550,7 +1552,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, for (; addr < end; pte++, addr += PAGE_SIZE) { pagemap_entry_t pme; - pme = pte_to_pagemap_entry(pm, vma, addr, *pte); + pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); err = add_to_pagemap(addr, &pme, pm); if (err) break; @@ -1893,10 +1895,11 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, return 0; } do { - struct page *page = can_gather_numa_stats(*pte, vma, addr); + pte_t ptent = ptep_get(pte); + struct page *page = can_gather_numa_stats(ptent, vma, addr); if (!page) continue; - gather_stats(page, md, pte_dirty(*pte), 1); + gather_stats(page, md, pte_dirty(ptent), 1); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index ca83423f8d54..478e2b169c13 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -335,6 +335,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, pud_t *pud; pmd_t *pmd, _pmd; pte_t *pte; + pte_t ptent; bool ret = true; mmap_assert_locked(mm); @@ -374,9 +375,10 @@ again: * changes under us. PTE markers should be handled the same as none * ptes here. */ - if (pte_none_mostly(*pte)) + ptent = ptep_get(pte); + if (pte_none_mostly(ptent)) ret = true; - if (!pte_write(*pte) && (reason & VM_UFFD_WP)) + if (!pte_write(ptent) && (reason & VM_UFFD_WP)) ret = true; pte_unmap(pte); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 21f942025fec..beb7c63d2871 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1185,7 +1185,11 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm) static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { +#ifdef CONFIG_MMU + return ptep_get(ptep); +#else return *ptep; +#endif } static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 0e1d239a882c..08c2bcefcb2b 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -555,7 +555,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, bool arm_uffd_pte = false; /* The current status of the pte should be "cleared" before calling */ - WARN_ON_ONCE(!pte_none(*pte)); + WARN_ON_ONCE(!pte_none(ptep_get(pte))); /* * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index fc06f6419661..5063b482e34f 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -231,7 +231,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - pte_t pte = *ptep; + pte_t pte = ptep_get(ptep); int r = 1; if (!pte_young(pte)) r = 0; @@ -318,7 +318,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - pte_t pte = *ptep; + pte_t pte = ptep_get(ptep); pte_clear(mm, address, ptep); page_table_check_pte_clear(mm, address, pte); return pte; @@ -519,7 +519,7 @@ extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, struct mm_struct; static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - pte_t old_pte = *ptep; + pte_t old_pte = ptep_get(ptep); set_pte_at(mm, address, ptep, pte_wrprotect(old_pte)); } #endif diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 607d742caa61..f0ac5b874919 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -192,7 +192,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, inc_mm_counter(mm, MM_ANONPAGES); } - flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); + flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte))); ptep_clear_flush_notify(vma, addr, pvmw.pte); if (new_page) set_pte_at_notify(mm, addr, pvmw.pte, diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index d4ab81229136..e940802a15a4 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -39,7 +39,7 @@ struct folio *damon_get_folio(unsigned long pfn) void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr) { - struct folio *folio = damon_get_folio(pte_pfn(*pte)); + struct folio *folio = damon_get_folio(pte_pfn(ptep_get(pte))); if (!folio) return; diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 5b3a3463d078..40801e38fcf0 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -89,7 +89,7 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, while (page_vma_mapped_walk(&pvmw)) { addr = pvmw.address; if (pvmw.pte) { - *accessed = pte_young(*pvmw.pte) || + *accessed = pte_young(ptep_get(pvmw.pte)) || !folio_test_idle(folio) || mmu_notifier_test_young(vma->vm_mm, addr); } else { diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index e814f66dfc2e..2fcc9731528a 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -323,7 +323,7 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, walk->action = ACTION_AGAIN; return 0; } - if (!pte_present(*pte)) + if (!pte_present(ptep_get(pte))) goto out; damon_ptep_mkold(pte, walk->vma, addr); out: @@ -433,6 +433,7 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t *pte; + pte_t ptent; spinlock_t *ptl; struct folio *folio; struct damon_young_walk_private *priv = walk->private; @@ -471,12 +472,13 @@ regular_page: walk->action = ACTION_AGAIN; return 0; } - if (!pte_present(*pte)) + ptent = ptep_get(pte); + if (!pte_present(ptent)) goto out; - folio = damon_get_folio(pte_pfn(*pte)); + folio = damon_get_folio(pte_pfn(ptent)); if (!folio) goto out; - if (pte_young(*pte) || !folio_test_idle(folio) || + if (pte_young(ptent) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) priv->young = true; *priv->folio_sz = folio_size(folio); diff --git a/mm/filemap.c b/mm/filemap.c index 1893048ec9ff..00933089b8b6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3523,7 +3523,7 @@ again: * handled in the specific fault path, and it'll prohibit the * fault-around logic. */ - if (!pte_none(*vmf->pte)) + if (!pte_none(ptep_get(vmf->pte))) goto unlock; /* We're about to handle the fault */ diff --git a/mm/gup.c b/mm/gup.c index 838db6c0bfc2..38986e522d34 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -477,13 +477,14 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, pte_t *pte, unsigned int flags) { if (flags & FOLL_TOUCH) { - pte_t entry = *pte; + pte_t orig_entry = ptep_get(pte); + pte_t entry = orig_entry; if (flags & FOLL_WRITE) entry = pte_mkdirty(entry); entry = pte_mkyoung(entry); - if (!pte_same(*pte, entry)) { + if (!pte_same(orig_entry, entry)) { set_pte_at(vma->vm_mm, address, pte, entry); update_mmu_cache(vma, address, pte); } @@ -549,7 +550,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) return no_page_table(vma, flags); - pte = *ptep; + pte = ptep_get(ptep); if (!pte_present(pte)) goto no_page; if (pte_protnone(pte) && !gup_can_follow_protnone(flags)) @@ -821,6 +822,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, pud_t *pud; pmd_t *pmd; pte_t *pte; + pte_t entry; int ret = -EFAULT; /* user gate pages are read-only */ @@ -844,16 +846,17 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, pte = pte_offset_map(pmd, address); if (!pte) return -EFAULT; - if (pte_none(*pte)) + entry = ptep_get(pte); + if (pte_none(entry)) goto unmap; *vma = get_gate_vma(mm); if (!page) goto out; - *page = vm_normal_page(*vma, address, *pte); + *page = vm_normal_page(*vma, address, entry); if (!*page) { - if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) + if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry))) goto unmap; - *page = pte_page(*pte); + *page = pte_page(entry); } ret = try_grab_page(*page, gup_flags); if (unlikely(ret)) @@ -2496,7 +2499,7 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, } if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || - unlikely(pte_val(pte) != pte_val(*ptep))) { + unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } @@ -2693,7 +2696,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, if (!folio) return 0; - if (unlikely(pte_val(pte) != pte_val(*ptep))) { + if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, refs, flags); return 0; } diff --git a/mm/highmem.c b/mm/highmem.c index db251e77f98f..e19269093a93 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -161,7 +161,7 @@ struct page *__kmap_to_page(void *vaddr) /* kmap() mappings */ if (WARN_ON_ONCE(addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP))) - return pte_page(pkmap_page_table[PKMAP_NR(addr)]); + return pte_page(ptep_get(&pkmap_page_table[PKMAP_NR(addr)])); /* kmap_local_page() mappings */ if (WARN_ON_ONCE(base >= __fix_to_virt(FIX_KMAP_END) && @@ -191,6 +191,7 @@ static void flush_all_zero_pkmaps(void) for (i = 0; i < LAST_PKMAP; i++) { struct page *page; + pte_t ptent; /* * zero means we don't have anything to do, @@ -203,7 +204,8 @@ static void flush_all_zero_pkmaps(void) pkmap_count[i] = 0; /* sanity check */ - BUG_ON(pte_none(pkmap_page_table[i])); + ptent = ptep_get(&pkmap_page_table[i]); + BUG_ON(pte_none(ptent)); /* * Don't need an atomic fetch-and-clear op here; @@ -212,7 +214,7 @@ static void flush_all_zero_pkmaps(void) * getting the kmap_lock (which is held here). * So no dangers, even with speculative execution. */ - page = pte_page(pkmap_page_table[i]); + page = pte_page(ptent); pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]); set_page_address(page, NULL); @@ -511,7 +513,7 @@ static inline bool kmap_high_unmap_local(unsigned long vaddr) { #ifdef ARCH_NEEDS_KMAP_HIGH_GET if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { - kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); + kunmap_high(pte_page(ptep_get(&pkmap_page_table[PKMAP_NR(vaddr)]))); return true; } #endif @@ -548,7 +550,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) idx = arch_kmap_local_map_idx(kmap_local_idx_push(), pfn); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); kmap_pte = kmap_get_pte(vaddr, idx); - BUG_ON(!pte_none(*kmap_pte)); + BUG_ON(!pte_none(ptep_get(kmap_pte))); pteval = pfn_pte(pfn, prot); arch_kmap_local_set_pte(&init_mm, vaddr, kmap_pte, pteval); arch_kmap_local_post_map(vaddr, pteval); diff --git a/mm/hmm.c b/mm/hmm.c index b1a9159d7c92..855e25e59d8f 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -228,7 +228,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, struct hmm_range *range = hmm_vma_walk->range; unsigned int required_fault; unsigned long cpu_flags; - pte_t pte = *ptep; + pte_t pte = ptep_get(ptep); uint64_t pfn_req_flags = *hmm_pfn; if (pte_none_mostly(pte)) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 76f970aa5b4d..e94fe292f30a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2063,7 +2063,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, entry = pte_mkspecial(entry); if (pmd_uffd_wp(old_pmd)) entry = pte_mkuffd_wp(entry); - VM_BUG_ON(!pte_none(*pte)); + VM_BUG_ON(!pte_none(ptep_get(pte))); set_pte_at(mm, addr, pte, entry); pte++; } @@ -2257,7 +2257,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_mkuffd_wp(entry); page_add_anon_rmap(page + i, vma, addr, false); } - VM_BUG_ON(!pte_none(*pte)); + VM_BUG_ON(!pte_none(ptep_get(pte))); set_pte_at(mm, addr, pte, entry); pte++; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1d3d8a61b336..d76574425da3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7246,7 +7246,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, pte = (pte_t *)pmd_alloc(mm, pud, addr); } } - BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte)); + BUG_ON(pte && pte_present(ptep_get(pte)) && !pte_huge(ptep_get(pte))); return pte; } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index f42079b73f82..c2007ef5e9b0 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -105,7 +105,7 @@ static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, * remapping (which is calling @walk->remap_pte). */ if (!walk->reuse_page) { - walk->reuse_page = pte_page(*pte); + walk->reuse_page = pte_page(ptep_get(pte)); /* * Because the reuse address is part of the range that we are * walking, skip the reuse address range. @@ -239,7 +239,7 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, * to the tail pages. */ pgprot_t pgprot = PAGE_KERNEL_RO; - struct page *page = pte_page(*pte); + struct page *page = pte_page(ptep_get(pte)); pte_t entry; /* Remapping the head page requires r/w */ @@ -286,7 +286,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct page *page; void *to; - BUG_ON(pte_page(*pte) != walk->reuse_page); + BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page); page = list_first_entry(walk->vmemmap_pages, struct page, lru); list_del(&page->lru); diff --git a/mm/kasan/init.c b/mm/kasan/init.c index cc64ed6858c6..dcfec277e839 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -286,7 +286,7 @@ static void kasan_free_pte(pte_t *pte_start, pmd_t *pmd) for (i = 0; i < PTRS_PER_PTE; i++) { pte = pte_start + i; - if (!pte_none(*pte)) + if (!pte_none(ptep_get(pte))) return; } @@ -343,16 +343,19 @@ static void kasan_remove_pte_table(pte_t *pte, unsigned long addr, unsigned long end) { unsigned long next; + pte_t ptent; for (; addr < end; addr = next, pte++) { next = (addr + PAGE_SIZE) & PAGE_MASK; if (next > end) next = end; - if (!pte_present(*pte)) + ptent = ptep_get(pte); + + if (!pte_present(ptent)) continue; - if (WARN_ON(!kasan_early_shadow_page_entry(*pte))) + if (WARN_ON(!kasan_early_shadow_page_entry(ptent))) continue; pte_clear(&init_mm, addr, pte); } diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 3e62728ae25d..dd772f9d0f08 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -226,7 +226,7 @@ static bool shadow_mapped(unsigned long addr) if (pmd_bad(*pmd)) return true; pte = pte_offset_kernel(pmd, addr); - return !pte_none(*pte); + return !pte_none(ptep_get(pte)); } static int __meminit kasan_mem_notifier(struct notifier_block *nb, @@ -317,7 +317,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, unsigned long page; pte_t pte; - if (likely(!pte_none(*ptep))) + if (likely(!pte_none(ptep_get(ptep)))) return 0; page = __get_free_page(GFP_KERNEL); @@ -328,7 +328,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL); spin_lock(&init_mm.page_table_lock); - if (likely(pte_none(*ptep))) { + if (likely(pte_none(ptep_get(ptep)))) { set_pte_at(&init_mm, addr, ptep, pte); page = 0; } @@ -418,11 +418,11 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, { unsigned long page; - page = (unsigned long)__va(pte_pfn(*ptep) << PAGE_SHIFT); + page = (unsigned long)__va(pte_pfn(ptep_get(ptep)) << PAGE_SHIFT); spin_lock(&init_mm.page_table_lock); - if (likely(!pte_none(*ptep))) { + if (likely(!pte_none(ptep_get(ptep)))) { pte_clear(&init_mm, addr, ptep); free_page(page); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 881669e738c0..0b4f00712895 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -511,7 +511,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, struct folio *folio, *tmp; while (--_pte >= pte) { - pte_t pteval = *_pte; + pte_t pteval = ptep_get(_pte); unsigned long pfn; if (pte_none(pteval)) @@ -555,7 +555,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { - pte_t pteval = *_pte; + pte_t pteval = ptep_get(_pte); if (pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) { ++none_or_zero; @@ -699,7 +699,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { - pteval = *_pte; + pteval = ptep_get(_pte); if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); if (is_zero_pfn(pte_pfn(pteval))) { @@ -797,7 +797,7 @@ static int __collapse_huge_page_copy(pte_t *pte, */ for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR; _pte++, page++, _address += PAGE_SIZE) { - pteval = *_pte; + pteval = ptep_get(_pte); if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { clear_user_highpage(page, _address); continue; @@ -1274,7 +1274,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { - pte_t pteval = *_pte; + pte_t pteval = ptep_get(_pte); if (is_swap_pte(pteval)) { ++unmapped; if (!cc->is_khugepaged || @@ -1650,18 +1650,19 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { struct page *page; + pte_t ptent = ptep_get(pte); /* empty pte, skip */ - if (pte_none(*pte)) + if (pte_none(ptent)) continue; /* page swapped out, abort */ - if (!pte_present(*pte)) { + if (!pte_present(ptent)) { result = SCAN_PTE_NON_PRESENT; goto abort; } - page = vm_normal_page(vma, addr, *pte); + page = vm_normal_page(vma, addr, ptent); if (WARN_ON_ONCE(page && is_zone_device_page(page))) page = NULL; /* @@ -1677,10 +1678,11 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { struct page *page; + pte_t ptent = ptep_get(pte); - if (pte_none(*pte)) + if (pte_none(ptent)) continue; - page = vm_normal_page(vma, addr, *pte); + page = vm_normal_page(vma, addr, ptent); if (WARN_ON_ONCE(page && is_zone_device_page(page))) goto abort; page_remove_rmap(page, vma, false); diff --git a/mm/ksm.c b/mm/ksm.c index 3dc15459dd20..d995779dc1fe 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -429,15 +429,17 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex struct page *page = NULL; spinlock_t *ptl; pte_t *pte; + pte_t ptent; int ret; pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte) return 0; - if (pte_present(*pte)) { - page = vm_normal_page(walk->vma, addr, *pte); - } else if (!pte_none(*pte)) { - swp_entry_t entry = pte_to_swp_entry(*pte); + ptent = ptep_get(pte); + if (pte_present(ptent)) { + page = vm_normal_page(walk->vma, addr, ptent); + } else if (!pte_none(ptent)) { + swp_entry_t entry = pte_to_swp_entry(ptent); /* * As KSM pages remain KSM pages until freed, no need to wait @@ -1085,6 +1087,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, int err = -EFAULT; struct mmu_notifier_range range; bool anon_exclusive; + pte_t entry; pvmw.address = page_address_in_vma(page, vma); if (pvmw.address == -EFAULT) @@ -1102,10 +1105,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, goto out_unlock; anon_exclusive = PageAnonExclusive(page); - if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || + entry = ptep_get(pvmw.pte); + if (pte_write(entry) || pte_dirty(entry) || anon_exclusive || mm_tlb_flush_pending(mm)) { - pte_t entry; - swapped = PageSwapCache(page); flush_cache_page(vma, pvmw.address, page_to_pfn(page)); /* @@ -1147,7 +1149,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); } - *orig_pte = *pvmw.pte; + *orig_pte = entry; err = 0; out_unlock: @@ -1204,7 +1206,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!ptep) goto out_mn; - if (!pte_same(*ptep, orig_pte)) { + if (!pte_same(ptep_get(ptep), orig_pte)) { pte_unmap_unlock(ptep, ptl); goto out_mn; } @@ -1231,7 +1233,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, dec_mm_counter(mm, MM_ANONPAGES); } - flush_cache_page(vma, addr, pte_pfn(*ptep)); + flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep))); /* * No need to notify as we are replacing a read only page with another * read only page with the same content. diff --git a/mm/madvise.c b/mm/madvise.c index 9b3c9610052f..886f06066622 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -207,7 +207,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, break; } - pte = *ptep; + pte = ptep_get(ptep); if (!is_swap_pte(pte)) continue; entry = pte_to_swp_entry(pte); @@ -438,7 +438,7 @@ regular_folio: flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr < end; pte++, addr += PAGE_SIZE) { - ptent = *pte; + ptent = ptep_get(pte); if (pte_none(ptent)) continue; @@ -642,7 +642,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr != end; pte++, addr += PAGE_SIZE) { - ptent = *pte; + ptent = ptep_get(pte); if (pte_none(ptent)) continue; diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index 87b4beeda4fa..a26dd8bcfcdb 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -35,7 +35,7 @@ static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct wp_walk *wpwalk = walk->private; - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); if (pte_write(ptent)) { pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte); @@ -91,7 +91,7 @@ static int clean_record_pte(pte_t *pte, unsigned long addr, { struct wp_walk *wpwalk = walk->private; struct clean_walk *cwalk = to_clean_walk(wpwalk); - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); if (pte_dirty(ptent)) { pgoff_t pgoff = ((addr - walk->vma->vm_start) >> PAGE_SHIFT) + diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 77d8d2d14fcf..93056918e956 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6025,7 +6025,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, if (!pte) return 0; for (; addr != end; pte++, addr += PAGE_SIZE) - if (get_mctgt_type(vma, addr, *pte, NULL)) + if (get_mctgt_type(vma, addr, ptep_get(pte), NULL)) mc.precharge++; /* increment precharge temporarily */ pte_unmap_unlock(pte - 1, ptl); cond_resched(); @@ -6246,7 +6246,7 @@ retry: if (!pte) return 0; for (; addr != end; addr += PAGE_SIZE) { - pte_t ptent = *(pte++); + pte_t ptent = ptep_get(pte++); bool device = false; swp_entry_t ent; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d5116f0eb1b6..e245191e6b04 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -6,16 +6,16 @@ * High level machine check handler. Handles pages reported by the * hardware as being corrupted usually due to a multi-bit ECC memory or cache * failure. - * + * * In addition there is a "soft offline" entry point that allows stop using * not-yet-corrupted-by-suspicious pages without killing anything. * * Handles page cache pages in various states. The tricky part - * here is that we can access any page asynchronously in respect to - * other VM users, because memory failures could happen anytime and - * anywhere. This could violate some of their assumptions. This is why - * this code has to be extremely careful. Generally it tries to use - * normal locking rules, as in get the standard locks, even if that means + * here is that we can access any page asynchronously in respect to + * other VM users, because memory failures could happen anytime and + * anywhere. This could violate some of their assumptions. This is why + * this code has to be extremely careful. Generally it tries to use + * normal locking rules, as in get the standard locks, even if that means * the error handling takes potentially a long time. * * It can be very tempting to add handling for obscure cases here. @@ -25,12 +25,12 @@ * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/ * - The case actually shows up as a frequent (top 10) page state in * tools/mm/page-types when running a real workload. - * + * * There are several operations here with exponential complexity because - * of unsuitable VM data structures. For example the operation to map back - * from RMAP chains to processes has to walk the complete process list and + * of unsuitable VM data structures. For example the operation to map back + * from RMAP chains to processes has to walk the complete process list and * has non linear complexity with the number. But since memory corruptions - * are rare we hope to get away with this. This avoids impacting the core + * are rare we hope to get away with this. This avoids impacting the core * VM. */ @@ -386,6 +386,7 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, pud_t *pud; pmd_t *pmd; pte_t *pte; + pte_t ptent; VM_BUG_ON_VMA(address == -EFAULT, vma); pgd = pgd_offset(vma->vm_mm, address); @@ -407,7 +408,8 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, pte = pte_offset_map(pmd, address); if (!pte) return 0; - if (pte_present(*pte) && pte_devmap(*pte)) + ptent = ptep_get(pte); + if (pte_present(ptent) && pte_devmap(ptent)) ret = PAGE_SHIFT; pte_unmap(pte); return ret; @@ -799,7 +801,7 @@ static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, goto out; for (; addr != end; ptep++, addr += PAGE_SIZE) { - ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT, + ret = check_hwpoisoned_entry(ptep_get(ptep), addr, PAGE_SHIFT, hwp->pfn, &hwp->tk); if (ret == 1) break; diff --git a/mm/memory.c b/mm/memory.c index 63c30f58142b..3d78b552866d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -699,15 +699,17 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, struct page *page, unsigned long address, pte_t *ptep) { + pte_t orig_pte; pte_t pte; swp_entry_t entry; + orig_pte = ptep_get(ptep); pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); - if (pte_swp_soft_dirty(*ptep)) + if (pte_swp_soft_dirty(orig_pte)) pte = pte_mksoft_dirty(pte); - entry = pte_to_swp_entry(*ptep); - if (pte_swp_uffd_wp(*ptep)) + entry = pte_to_swp_entry(orig_pte); + if (pte_swp_uffd_wp(orig_pte)) pte = pte_mkuffd_wp(pte); else if (is_writable_device_exclusive_entry(entry)) pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -744,7 +746,7 @@ static int try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma, unsigned long addr) { - swp_entry_t entry = pte_to_swp_entry(*src_pte); + swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte)); struct page *page = pfn_swap_entry_to_page(entry); if (trylock_page(page)) { @@ -768,9 +770,10 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, struct vm_area_struct *src_vma, unsigned long addr, int *rss) { unsigned long vm_flags = dst_vma->vm_flags; - pte_t pte = *src_pte; + pte_t orig_pte = ptep_get(src_pte); + pte_t pte = orig_pte; struct page *page; - swp_entry_t entry = pte_to_swp_entry(pte); + swp_entry_t entry = pte_to_swp_entry(orig_pte); if (likely(!non_swap_entry(entry))) { if (swap_duplicate(entry) < 0) @@ -785,8 +788,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, spin_unlock(&mmlist_lock); } /* Mark the swap entry as shared. */ - if (pte_swp_exclusive(*src_pte)) { - pte = pte_swp_clear_exclusive(*src_pte); + if (pte_swp_exclusive(orig_pte)) { + pte = pte_swp_clear_exclusive(orig_pte); set_pte_at(src_mm, addr, src_pte, pte); } rss[MM_SWAPENTS]++; @@ -805,9 +808,9 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, entry = make_readable_migration_entry( swp_offset(entry)); pte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(*src_pte)) + if (pte_swp_soft_dirty(orig_pte)) pte = pte_swp_mksoft_dirty(pte); - if (pte_swp_uffd_wp(*src_pte)) + if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } @@ -840,7 +843,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, entry = make_readable_device_private_entry( swp_offset(entry)); pte = swp_entry_to_pte(entry); - if (pte_swp_uffd_wp(*src_pte)) + if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } @@ -904,7 +907,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma /* All done, just insert the new page copy in the child */ pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); - if (userfaultfd_pte_wp(dst_vma, *src_pte)) + if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte))) /* Uffd-wp needs to be delivered to dest pte as well */ pte = pte_mkuffd_wp(pte); set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); @@ -922,7 +925,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, { struct mm_struct *src_mm = src_vma->vm_mm; unsigned long vm_flags = src_vma->vm_flags; - pte_t pte = *src_pte; + pte_t pte = ptep_get(src_pte); struct page *page; struct folio *folio; @@ -1002,6 +1005,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, struct mm_struct *src_mm = src_vma->vm_mm; pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; + pte_t ptent; spinlock_t *src_ptl, *dst_ptl; int progress, ret = 0; int rss[NR_MM_COUNTERS]; @@ -1047,17 +1051,18 @@ again: spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) break; } - if (pte_none(*src_pte)) { + ptent = ptep_get(src_pte); + if (pte_none(ptent)) { progress++; continue; } - if (unlikely(!pte_present(*src_pte))) { + if (unlikely(!pte_present(ptent))) { ret = copy_nonpresent_pte(dst_mm, src_mm, dst_pte, src_pte, dst_vma, src_vma, addr, rss); if (ret == -EIO) { - entry = pte_to_swp_entry(*src_pte); + entry = pte_to_swp_entry(ptep_get(src_pte)); break; } else if (ret == -EBUSY) { break; @@ -1407,7 +1412,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); struct page *page; if (pte_none(ptent)) @@ -1822,7 +1827,7 @@ static int validate_page_before_insert(struct page *page) static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { - if (!pte_none(*pte)) + if (!pte_none(ptep_get(pte))) return -EBUSY; /* Ok, finally just insert the thing.. */ get_page(page); @@ -2116,7 +2121,8 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, pte = get_locked_pte(mm, addr, &ptl); if (!pte) return VM_FAULT_OOM; - if (!pte_none(*pte)) { + entry = ptep_get(pte); + if (!pte_none(entry)) { if (mkwrite) { /* * For read faults on private mappings the PFN passed @@ -2128,11 +2134,11 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, * allocation and mapping invalidation so just skip the * update. */ - if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) { - WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte))); + if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) { + WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry))); goto out_unlock; } - entry = pte_mkyoung(*pte); + entry = pte_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, addr, pte, entry, 1)) update_mmu_cache(vma, addr, pte); @@ -2344,7 +2350,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, return -ENOMEM; arch_enter_lazy_mmu_mode(); do { - BUG_ON(!pte_none(*pte)); + BUG_ON(!pte_none(ptep_get(pte))); if (!pfn_modify_allowed(pfn, prot)) { err = -EACCES; break; @@ -2585,7 +2591,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, if (fn) { do { - if (create || !pte_none(*pte)) { + if (create || !pte_none(ptep_get(pte))) { err = fn(pte++, addr, data); if (err) break; @@ -2787,7 +2793,7 @@ static inline int pte_unmap_same(struct vm_fault *vmf) #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION) if (sizeof(pte_t) > sizeof(unsigned long)) { spin_lock(vmf->ptl); - same = pte_same(*vmf->pte, vmf->orig_pte); + same = pte_same(ptep_get(vmf->pte), vmf->orig_pte); spin_unlock(vmf->ptl); } #endif @@ -2838,7 +2844,7 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, pte_t entry; vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); - if (unlikely(!vmf->pte || !pte_same(*vmf->pte, vmf->orig_pte))) { + if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { /* * Other thread has already handled the fault * and update local tlb only @@ -2866,7 +2872,7 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, /* Re-validate under PTL if the page is still mapped */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); - if (unlikely(!vmf->pte || !pte_same(*vmf->pte, vmf->orig_pte))) { + if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { /* The PTE changed under us, update local tlb */ if (vmf->pte) update_mmu_tlb(vma, addr, vmf->pte); @@ -3114,7 +3120,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * Re-check the pte - we dropped the lock */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); - if (likely(vmf->pte && pte_same(*vmf->pte, vmf->orig_pte))) { + if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { if (old_folio) { if (!folio_test_anon(old_folio)) { dec_mm_counter(mm, mm_counter_file(&old_folio->page)); @@ -3241,7 +3247,7 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) * We might have raced with another page fault while we released the * pte_offset_map_lock. */ - if (!pte_same(*vmf->pte, vmf->orig_pte)) { + if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return VM_FAULT_NOPAGE; @@ -3336,7 +3342,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) struct folio *folio = NULL; if (likely(!unshare)) { - if (userfaultfd_pte_wp(vma, *vmf->pte)) { + if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_WP); } @@ -3598,7 +3604,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (likely(vmf->pte && pte_same(*vmf->pte, vmf->orig_pte))) + if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte); if (vmf->pte) @@ -3643,7 +3649,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR. * So is_pte_marker() check is not enough to safely drop the pte. */ - if (pte_same(vmf->orig_pte, *vmf->pte)) + if (pte_same(vmf->orig_pte, ptep_get(vmf->pte))) pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; @@ -3739,7 +3745,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte || - !pte_same(*vmf->pte, vmf->orig_pte))) + !pte_same(ptep_get(vmf->pte), + vmf->orig_pte))) goto unlock; /* @@ -3816,7 +3823,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (likely(vmf->pte && pte_same(*vmf->pte, vmf->orig_pte))) + if (likely(vmf->pte && + pte_same(ptep_get(vmf->pte), vmf->orig_pte))) ret = VM_FAULT_OOM; goto unlock; } @@ -3886,7 +3894,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (unlikely(!vmf->pte || !pte_same(*vmf->pte, vmf->orig_pte))) + if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) goto out_nomap; if (unlikely(!folio_test_uptodate(folio))) { @@ -4331,9 +4339,9 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) static bool vmf_pte_changed(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID) - return !pte_same(*vmf->pte, vmf->orig_pte); + return !pte_same(ptep_get(vmf->pte), vmf->orig_pte); - return !pte_none(*vmf->pte); + return !pte_none(ptep_get(vmf->pte)); } /** @@ -4643,7 +4651,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf) * we don't have concurrent modification by hardware * followed by an update. */ - if (unlikely(pte_none(*vmf->pte))) + if (unlikely(pte_none(ptep_get(vmf->pte)))) ret = VM_FAULT_SIGBUS; else ret = VM_FAULT_NOPAGE; @@ -4699,7 +4707,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * the pfn may be screwed if the read is non atomic. */ spin_lock(vmf->ptl); - if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } @@ -4772,7 +4780,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) vmf->address, &vmf->ptl); if (unlikely(!vmf->pte)) goto out; - if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } @@ -4930,7 +4938,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) spin_lock(vmf->ptl); entry = vmf->orig_pte; - if (unlikely(!pte_same(*vmf->pte, entry))) { + if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); goto unlock; } @@ -5416,7 +5424,7 @@ int follow_pte(struct mm_struct *mm, unsigned long address, ptep = pte_offset_map_lock(mm, pmd, address, ptlp); if (!ptep) goto out; - if (!pte_present(*ptep)) + if (!pte_present(ptep_get(ptep))) goto unlock; *ptepp = ptep; return 0; @@ -5453,7 +5461,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address, ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); if (ret) return ret; - *pfn = pte_pfn(*ptep); + *pfn = pte_pfn(ptep_get(ptep)); pte_unmap_unlock(ptep, ptl); return 0; } @@ -5473,7 +5481,7 @@ int follow_phys(struct vm_area_struct *vma, if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) goto out; - pte = *ptep; + pte = ptep_get(ptep); if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; @@ -5517,7 +5525,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, retry: if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) return -EINVAL; - pte = *ptep; + pte = ptep_get(ptep); pte_unmap_unlock(ptep, ptl); prot = pgprot_val(pte_pgprot(pte)); @@ -5533,7 +5541,7 @@ retry: if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) goto out_unmap; - if (!pte_same(pte, *ptep)) { + if (!pte_same(pte, ptep_get(ptep))) { pte_unmap_unlock(ptep, ptl); iounmap(maddr); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0241bb64978b..edc25195f5bd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -508,6 +508,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long flags = qp->flags; bool has_unmovable = false; pte_t *pte, *mapped_pte; + pte_t ptent; spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmd, vma); @@ -520,9 +521,10 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, return 0; } for (; addr != end; pte++, addr += PAGE_SIZE) { - if (!pte_present(*pte)) + ptent = ptep_get(pte); + if (!pte_present(ptent)) continue; - folio = vm_normal_folio(vma, addr, *pte); + folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; /* diff --git a/mm/migrate.c b/mm/migrate.c index 363562992046..ce35afdbc1e3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -188,6 +188,7 @@ static bool remove_migration_pte(struct folio *folio, while (page_vma_mapped_walk(&pvmw)) { rmap_t rmap_flags = RMAP_NONE; + pte_t old_pte; pte_t pte; swp_entry_t entry; struct page *new; @@ -210,17 +211,18 @@ static bool remove_migration_pte(struct folio *folio, folio_get(folio); pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); - if (pte_swp_soft_dirty(*pvmw.pte)) + old_pte = ptep_get(pvmw.pte); + if (pte_swp_soft_dirty(old_pte)) pte = pte_mksoft_dirty(pte); - entry = pte_to_swp_entry(*pvmw.pte); + entry = pte_to_swp_entry(old_pte); if (!is_migration_entry_young(entry)) pte = pte_mkold(pte); if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) pte = pte_mkdirty(pte); if (is_writable_migration_entry(entry)) pte = pte_mkwrite(pte); - else if (pte_swp_uffd_wp(*pvmw.pte)) + else if (pte_swp_uffd_wp(old_pte)) pte = pte_mkuffd_wp(pte); if (folio_test_anon(folio) && !is_readable_migration_entry(entry)) @@ -234,9 +236,9 @@ static bool remove_migration_pte(struct folio *folio, entry = make_readable_device_private_entry( page_to_pfn(new)); pte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(*pvmw.pte)) + if (pte_swp_soft_dirty(old_pte)) pte = pte_swp_mksoft_dirty(pte); - if (pte_swp_uffd_wp(*pvmw.pte)) + if (pte_swp_uffd_wp(old_pte)) pte = pte_swp_mkuffd_wp(pte); } @@ -308,7 +310,7 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, if (!ptep) return; - pte = *ptep; + pte = ptep_get(ptep); pte_unmap(ptep); if (!is_swap_pte(pte)) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index a14af6b12b04..02d272b909b5 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -111,7 +111,7 @@ again: swp_entry_t entry; pte_t pte; - pte = *ptep; + pte = ptep_get(ptep); if (pte_none(pte)) { if (vma_is_anonymous(vma)) { @@ -194,7 +194,7 @@ again: bool anon_exclusive; pte_t swp_pte; - flush_cache_page(vma, addr, pte_pfn(*ptep)); + flush_cache_page(vma, addr, pte_pfn(pte)); anon_exclusive = PageAnon(page) && PageAnonExclusive(page); if (anon_exclusive) { pte = ptep_clear_flush(vma, addr, ptep); @@ -573,6 +573,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, pud_t *pudp; pmd_t *pmdp; pte_t *ptep; + pte_t orig_pte; /* Only allow populating anonymous memory */ if (!vma_is_anonymous(vma)) @@ -628,16 +629,18 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); if (!ptep) goto abort; + orig_pte = ptep_get(ptep); + if (check_stable_address_space(mm)) goto unlock_abort; - if (pte_present(*ptep)) { - unsigned long pfn = pte_pfn(*ptep); + if (pte_present(orig_pte)) { + unsigned long pfn = pte_pfn(orig_pte); if (!is_zero_pfn(pfn)) goto unlock_abort; flush = true; - } else if (!pte_none(*ptep)) + } else if (!pte_none(orig_pte)) goto unlock_abort; /* @@ -654,7 +657,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, get_page(page); if (flush) { - flush_cache_page(vma, addr, pte_pfn(*ptep)); + flush_cache_page(vma, addr, pte_pfn(orig_pte)); ptep_clear_flush_notify(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, entry); update_mmu_cache(vma, addr, ptep); diff --git a/mm/mincore.c b/mm/mincore.c index f33f6a0b1ded..b7f7a516b26c 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -119,7 +119,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return 0; } for (; addr != end; ptep++, addr += PAGE_SIZE) { - pte_t pte = *ptep; + pte_t pte = ptep_get(ptep); /* We need to do cache lookup too for pte markers */ if (pte_none_mostly(pte)) diff --git a/mm/mlock.c b/mm/mlock.c index 9f2b1173b1b1..d7db94519884 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -312,6 +312,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *start_pte, *pte; + pte_t ptent; struct folio *folio; ptl = pmd_trans_huge_lock(pmd, vma); @@ -334,9 +335,10 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, return 0; } for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { - if (!pte_present(*pte)) + ptent = ptep_get(pte); + if (!pte_present(ptent)) continue; - folio = vm_normal_folio(vma, addr, *pte); + folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; if (folio_test_large(folio)) diff --git a/mm/mprotect.c b/mm/mprotect.c index 64e1df0af514..327a6eb90afb 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -105,7 +105,7 @@ static long change_pte_range(struct mmu_gather *tlb, flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); do { - oldpte = *pte; + oldpte = ptep_get(pte); if (pte_present(oldpte)) { pte_t ptent; @@ -544,7 +544,8 @@ long change_protection(struct mmu_gather *tlb, static int prot_none_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { - return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? + return pfn_modify_allowed(pte_pfn(ptep_get(pte)), + *(pgprot_t *)(walk->private)) ? 0 : -EACCES; } @@ -552,7 +553,8 @@ static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long next, struct mm_walk *walk) { - return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? + return pfn_modify_allowed(pte_pfn(ptep_get(pte)), + *(pgprot_t *)(walk->private)) ? 0 : -EACCES; } diff --git a/mm/mremap.c b/mm/mremap.c index bfc3d1902a94..8ec184ac90ff 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -188,7 +188,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, new_pte++, new_addr += PAGE_SIZE) { - if (pte_none(*old_pte)) + if (pte_none(ptep_get(old_pte))) continue; pte = ptep_get_and_clear(mm, old_addr, old_pte); diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 0c511330dbc9..8f89f9c8f0df 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -190,7 +190,7 @@ void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, if (&init_mm == mm) return; - __page_table_check_pte_clear(mm, addr, *ptep); + __page_table_check_pte_clear(mm, addr, ptep_get(ptep)); if (pte_user_accessible_page(pte)) { page_table_check_set(mm, addr, pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT, @@ -243,7 +243,7 @@ void __page_table_check_pte_clear_range(struct mm_struct *mm, if (WARN_ON(!ptep)) return; for (i = 0; i < PTRS_PER_PTE; i++) { - __page_table_check_pte_clear(mm, addr, *ptep); + __page_table_check_pte_clear(mm, addr, ptep_get(ptep)); addr += PAGE_SIZE; ptep++; } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 2af734274073..49e0d28f0379 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -15,6 +15,8 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw) static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) { + pte_t ptent; + if (pvmw->flags & PVMW_SYNC) { /* Use the stricter lookup */ pvmw->pte = pte_offset_map_lock(pvmw->vma->vm_mm, pvmw->pmd, @@ -35,10 +37,12 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) if (!pvmw->pte) return false; + ptent = ptep_get(pvmw->pte); + if (pvmw->flags & PVMW_MIGRATION) { - if (!is_swap_pte(*pvmw->pte)) + if (!is_swap_pte(ptent)) return false; - } else if (is_swap_pte(*pvmw->pte)) { + } else if (is_swap_pte(ptent)) { swp_entry_t entry; /* * Handle un-addressable ZONE_DEVICE memory. @@ -56,11 +60,11 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) * For more details on device private memory see HMM * (include/linux/hmm.h or mm/hmm.c). */ - entry = pte_to_swp_entry(*pvmw->pte); + entry = pte_to_swp_entry(ptent); if (!is_device_private_entry(entry) && !is_device_exclusive_entry(entry)) return false; - } else if (!pte_present(*pvmw->pte)) { + } else if (!pte_present(ptent)) { return false; } pvmw->ptl = *ptlp; @@ -90,33 +94,34 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) static bool check_pte(struct page_vma_mapped_walk *pvmw) { unsigned long pfn; + pte_t ptent = ptep_get(pvmw->pte); if (pvmw->flags & PVMW_MIGRATION) { swp_entry_t entry; - if (!is_swap_pte(*pvmw->pte)) + if (!is_swap_pte(ptent)) return false; - entry = pte_to_swp_entry(*pvmw->pte); + entry = pte_to_swp_entry(ptent); if (!is_migration_entry(entry) && !is_device_exclusive_entry(entry)) return false; pfn = swp_offset_pfn(entry); - } else if (is_swap_pte(*pvmw->pte)) { + } else if (is_swap_pte(ptent)) { swp_entry_t entry; /* Handle un-addressable ZONE_DEVICE memory */ - entry = pte_to_swp_entry(*pvmw->pte); + entry = pte_to_swp_entry(ptent); if (!is_device_private_entry(entry) && !is_device_exclusive_entry(entry)) return false; pfn = swp_offset_pfn(entry); } else { - if (!pte_present(*pvmw->pte)) + if (!pte_present(ptent)) return false; - pfn = pte_pfn(*pvmw->pte); + pfn = pte_pfn(ptent); } return (pfn - pvmw->pfn) < pvmw->nr_pages; @@ -294,7 +299,7 @@ next_pte: goto restart; } pvmw->pte++; - } while (pte_none(*pvmw->pte)); + } while (pte_none(ptep_get(pvmw->pte))); if (!pvmw->ptl) { pvmw->ptl = ptl; diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index c7ab18a5fb77..4d454953046f 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -68,7 +68,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { - int changed = !pte_same(*ptep, entry); + int changed = !pte_same(ptep_get(ptep), entry); if (changed) { set_pte_at(vma->vm_mm, address, ptep, entry); flush_tlb_fix_spurious_fault(vma, address, ptep); diff --git a/mm/rmap.c b/mm/rmap.c index cd918cb9a431..0c0d8857dfce 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -826,7 +826,8 @@ static bool folio_referenced_one(struct folio *folio, } if (pvmw.pte) { - if (lru_gen_enabled() && pte_young(*pvmw.pte)) { + if (lru_gen_enabled() && + pte_young(ptep_get(pvmw.pte))) { lru_gen_look_around(&pvmw); referenced++; } @@ -956,13 +957,13 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) address = pvmw->address; if (pvmw->pte) { - pte_t entry; pte_t *pte = pvmw->pte; + pte_t entry = ptep_get(pte); - if (!pte_dirty(*pte) && !pte_write(*pte)) + if (!pte_dirty(entry) && !pte_write(entry)) continue; - flush_cache_page(vma, address, pte_pfn(*pte)); + flush_cache_page(vma, address, pte_pfn(entry)); entry = ptep_clear_flush(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); @@ -1137,7 +1138,7 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) * @folio: Folio which contains page. * @page: Page to add to rmap. * @vma: VM area to add page to. - * @address: User virtual address of the mapping + * @address: User virtual address of the mapping * @exclusive: the page is exclusively owned by the current process */ static void __page_set_anon_rmap(struct folio *folio, struct page *page, @@ -1458,6 +1459,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, bool anon_exclusive, ret = true; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; + unsigned long pfn; /* * When racing against e.g. zap_pte_range() on another cpu, @@ -1508,8 +1510,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, break; } - subpage = folio_page(folio, - pte_pfn(*pvmw.pte) - folio_pfn(folio)); + pfn = pte_pfn(ptep_get(pvmw.pte)); + subpage = folio_page(folio, pfn - folio_pfn(folio)); address = pvmw.address; anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage); @@ -1571,7 +1573,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { - flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + flush_cache_page(vma, address, pfn); /* Nuke the page table entry. */ if (should_defer_flush(mm, flags)) { /* @@ -1818,6 +1820,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, bool anon_exclusive, ret = true; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; + unsigned long pfn; /* * When racing against e.g. zap_pte_range() on another cpu, @@ -1877,6 +1880,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); + pfn = pte_pfn(ptep_get(pvmw.pte)); + if (folio_is_zone_device(folio)) { /* * Our PTE is a non-present device exclusive entry and @@ -1891,8 +1896,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio); subpage = &folio->page; } else { - subpage = folio_page(folio, - pte_pfn(*pvmw.pte) - folio_pfn(folio)); + subpage = folio_page(folio, pfn - folio_pfn(folio)); } address = pvmw.address; anon_exclusive = folio_test_anon(folio) && @@ -1952,7 +1956,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, /* Nuke the hugetlb page table entry */ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { - flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + flush_cache_page(vma, address, pfn); /* Nuke the page table entry. */ if (should_defer_flush(mm, flags)) { /* @@ -2187,6 +2191,7 @@ static bool page_make_device_exclusive_one(struct folio *folio, struct mmu_notifier_range range; swp_entry_t entry; pte_t swp_pte; + pte_t ptent; mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma->vm_mm, address, min(vma->vm_end, @@ -2198,18 +2203,19 @@ static bool page_make_device_exclusive_one(struct folio *folio, /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); - if (!pte_present(*pvmw.pte)) { + ptent = ptep_get(pvmw.pte); + if (!pte_present(ptent)) { ret = false; page_vma_mapped_walk_done(&pvmw); break; } subpage = folio_page(folio, - pte_pfn(*pvmw.pte) - folio_pfn(folio)); + pte_pfn(ptent) - folio_pfn(folio)); address = pvmw.address; /* Nuke the page table entry. */ - flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + flush_cache_page(vma, address, pte_pfn(ptent)); pteval = ptep_clear_flush(vma, address, pvmw.pte); /* Set the dirty flag on the folio now the pte is gone. */ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 10d73a0dfcec..a044a130405b 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -133,7 +133,7 @@ static void * __meminit altmap_alloc_block_buf(unsigned long size, void __meminit vmemmap_verify(pte_t *pte, int node, unsigned long start, unsigned long end) { - unsigned long pfn = pte_pfn(*pte); + unsigned long pfn = pte_pfn(ptep_get(pte)); int actual_node = early_pfn_to_nid(pfn); if (node_distance(actual_node, node) > LOCAL_DISTANCE) @@ -146,7 +146,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, struct page *reuse) { pte_t *pte = pte_offset_kernel(pmd, addr); - if (pte_none(*pte)) { + if (pte_none(ptep_get(pte))) { pte_t entry; void *p; @@ -414,7 +414,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, * with just tail struct pages. */ return vmemmap_populate_range(start, end, node, NULL, - pte_page(*pte)); + pte_page(ptep_get(pte))); } size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page)); @@ -438,7 +438,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, */ next += PAGE_SIZE; rc = vmemmap_populate_range(next, last, node, NULL, - pte_page(*pte)); + pte_page(ptep_get(pte))); if (rc) return -ENOMEM; } diff --git a/mm/swap_state.c b/mm/swap_state.c index a33c60e0158f..4a5c7b748051 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -275,9 +275,9 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, } } -/* - * If we are the only user, then try to free up the swap cache. - * +/* + * If we are the only user, then try to free up the swap cache. + * * Its ok to check the swapcache flag without the folio lock * here because we are going to recheck again inside * folio_free_swap() _with_ the lock. @@ -294,7 +294,7 @@ void free_swap_cache(struct page *page) } } -/* +/* * Perform a free_page(), also freeing any swap cache associated with * this page if it is the last user of the page. */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 74dd4d2337b7..a6945c2e0d03 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1745,7 +1745,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, struct page *page = folio_file_page(folio, swp_offset(entry)); struct page *swapcache; spinlock_t *ptl; - pte_t *pte, new_pte; + pte_t *pte, new_pte, old_pte; bool hwposioned = false; int ret = 1; @@ -1757,11 +1757,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, hwposioned = true; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (unlikely(!pte || !pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { + if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte), + swp_entry_to_pte(entry)))) { ret = 0; goto out; } + old_pte = ptep_get(pte); + if (unlikely(hwposioned || !PageUptodate(page))) { swp_entry_t swp_entry; @@ -1793,7 +1796,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, * call and have the page locked. */ VM_BUG_ON_PAGE(PageWriteback(page), page); - if (pte_swp_exclusive(*pte)) + if (pte_swp_exclusive(old_pte)) rmap_flags |= RMAP_EXCLUSIVE; page_add_anon_rmap(page, vma, addr, rmap_flags); @@ -1802,9 +1805,9 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, lru_cache_add_inactive_or_unevictable(page, vma); } new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); - if (pte_swp_soft_dirty(*pte)) + if (pte_swp_soft_dirty(old_pte)) new_pte = pte_mksoft_dirty(new_pte); - if (pte_swp_uffd_wp(*pte)) + if (pte_swp_uffd_wp(old_pte)) new_pte = pte_mkuffd_wp(new_pte); setpte: set_pte_at(vma->vm_mm, addr, pte, new_pte); @@ -1833,6 +1836,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned char swp_count; swp_entry_t entry; int ret; + pte_t ptent; if (!pte++) { pte = pte_offset_map(pmd, addr); @@ -1840,10 +1844,12 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, break; } - if (!is_swap_pte(*pte)) + ptent = ptep_get_lockless(pte); + + if (!is_swap_pte(ptent)) continue; - entry = pte_to_swp_entry(*pte); + entry = pte_to_swp_entry(ptent); if (swp_type(entry) != type) continue; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 5fd787158c70..a2bf37ee276d 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -97,7 +97,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, * registered, we firstly wr-protect a none pte which has no page cache * page backing it, then access the page. */ - if (!pte_none_mostly(*dst_pte)) + if (!pte_none_mostly(ptep_get(dst_pte))) goto out_unlock; folio = page_folio(page); @@ -230,7 +230,7 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, goto out_unlock; } ret = -EEXIST; - if (!pte_none(*dst_pte)) + if (!pte_none(ptep_get(dst_pte))) goto out_unlock; set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7382e0a60ce1..5a3bf408251b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -103,7 +103,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (!pte) return -ENOMEM; do { - BUG_ON(!pte_none(*pte)); + BUG_ON(!pte_none(ptep_get(pte))); #ifdef CONFIG_HUGETLB_PAGE size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift); @@ -472,7 +472,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, do { struct page *page = pages[*nr]; - if (WARN_ON(!pte_none(*pte))) + if (WARN_ON(!pte_none(ptep_get(pte)))) return -EBUSY; if (WARN_ON(!page)) return -ENOMEM; @@ -704,7 +704,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) return NULL; ptep = pte_offset_kernel(pmd, addr); - pte = *ptep; + pte = ptep_get(ptep); if (pte_present(pte)) page = pte_page(pte); diff --git a/mm/vmscan.c b/mm/vmscan.c index 3f64c8d9f629..e305c11ec8fc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4037,15 +4037,16 @@ restart: for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; struct folio *folio; + pte_t ptent = ptep_get(pte + i); total++; walk->mm_stats[MM_LEAF_TOTAL]++; - pfn = get_pte_pfn(pte[i], args->vma, addr); + pfn = get_pte_pfn(ptent, args->vma, addr); if (pfn == -1) continue; - if (!pte_young(pte[i])) { + if (!pte_young(ptent)) { walk->mm_stats[MM_LEAF_OLD]++; continue; } @@ -4060,7 +4061,7 @@ restart: young++; walk->mm_stats[MM_LEAF_YOUNG]++; - if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && + if (pte_dirty(ptent) && !folio_test_dirty(folio) && !(folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio))) folio_mark_dirty(folio); @@ -4703,12 +4704,13 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; + pte_t ptent = ptep_get(pte + i); - pfn = get_pte_pfn(pte[i], pvmw->vma, addr); + pfn = get_pte_pfn(ptent, pvmw->vma, addr); if (pfn == -1) continue; - if (!pte_young(pte[i])) + if (!pte_young(ptent)) continue; folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); @@ -4720,7 +4722,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) young++; - if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && + if (pte_dirty(ptent) && !folio_test_dirty(folio) && !(folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio))) folio_mark_dirty(folio); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 51e4882d0873..fb37adecfc91 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2578,6 +2578,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, { kvm_pfn_t pfn; pte_t *ptep; + pte_t pte; spinlock_t *ptl; int r; @@ -2601,14 +2602,16 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, return r; } - if (write_fault && !pte_write(*ptep)) { + pte = ptep_get(ptep); + + if (write_fault && !pte_write(pte)) { pfn = KVM_PFN_ERR_RO_FAULT; goto out; } if (writable) - *writable = pte_write(*ptep); - pfn = pte_pfn(*ptep); + *writable = pte_write(pte); + pfn = pte_pfn(pte); /* * Get a reference here because callers of *hva_to_pfn* and @@ -2626,7 +2629,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, * tail pages of non-compound higher order allocations, which * would then underflow the refcount when the caller does the * required put_page. Don't allow those pages here. - */ + */ if (!kvm_try_get_pfn(pfn)) r = -EFAULT; -- cgit v1.2.3 From 6c77b607ee26472fb945aa41734281c39d06d68f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 14 Jun 2023 22:36:12 +0800 Subject: mm: kill lock|unlock_page_memcg() Since commit c7c3dec1c9db ("mm: rmap: remove lock_page_memcg()"), no more user, kill lock_page_memcg() and unlock_page_memcg(). Link: https://lkml.kernel.org/r/20230614143612.62575-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Johannes Weiner Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v1/memory.rst | 2 +- include/linux/memcontrol.h | 12 +----------- mm/filemap.c | 2 +- mm/memcontrol.c | 18 ++++-------------- mm/page-writeback.c | 6 +++--- 5 files changed, 10 insertions(+), 30 deletions(-) (limited to 'mm/filemap.c') diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 47d1d7d932a8..fabaad3fd9c2 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -297,7 +297,7 @@ Lock order is as follows:: Page lock (PG_locked bit of page->flags) mm->page_table_lock or split pte_lock - lock_page_memcg (memcg->move_lock) + folio_memcg_lock (memcg->move_lock) mapping->i_pages lock lruvec->lru_lock. diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 00a88cf947e1..c3d3a0c09315 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -419,7 +419,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) * * - the folio lock * - LRU isolation - * - lock_page_memcg() + * - folio_memcg_lock() * - exclusive reference * - mem_cgroup_trylock_pages() * @@ -949,8 +949,6 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); void folio_memcg_lock(struct folio *folio); void folio_memcg_unlock(struct folio *folio); -void lock_page_memcg(struct page *page); -void unlock_page_memcg(struct page *page); void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val); @@ -1438,14 +1436,6 @@ mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { } -static inline void lock_page_memcg(struct page *page) -{ -} - -static inline void unlock_page_memcg(struct page *page) -{ -} - static inline void folio_memcg_lock(struct folio *folio) { } diff --git a/mm/filemap.c b/mm/filemap.c index 00933089b8b6..758bbdf300e7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -117,7 +117,7 @@ * ->i_pages lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) - * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) + * ->memcg->move_lock (page_remove_rmap->folio_memcg_lock) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->block_dirty_folio) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 93056918e956..cf06b1c9b3bb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2148,17 +2148,12 @@ again: * When charge migration first begins, we can have multiple * critical sections holding the fast-path RCU lock and one * holding the slowpath move_lock. Track the task who has the - * move_lock for unlock_page_memcg(). + * move_lock for folio_memcg_unlock(). */ memcg->move_lock_task = current; memcg->move_lock_flags = flags; } -void lock_page_memcg(struct page *page) -{ - folio_memcg_lock(page_folio(page)); -} - static void __folio_memcg_unlock(struct mem_cgroup *memcg) { if (memcg && memcg->move_lock_task == current) { @@ -2186,11 +2181,6 @@ void folio_memcg_unlock(struct folio *folio) __folio_memcg_unlock(folio_memcg(folio)); } -void unlock_page_memcg(struct page *page) -{ - folio_memcg_unlock(page_folio(page)); -} - struct memcg_stock_pcp { local_lock_t stock_lock; struct mem_cgroup *cached; /* this never be root cgroup */ @@ -2866,7 +2856,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) * * - the page lock * - LRU isolation - * - lock_page_memcg() + * - folio_memcg_lock() * - exclusive reference * - mem_cgroup_trylock_pages() */ @@ -5829,7 +5819,7 @@ static int mem_cgroup_move_account(struct page *page, * with (un)charging, migration, LRU putback, or anything else * that would rely on a stable page's memory cgroup. * - * Note that lock_page_memcg is a memcg lock, not a page lock, + * Note that folio_memcg_lock is a memcg lock, not a page lock, * to save space. As soon as we switch page's memory cgroup to a * new memcg that isn't locked, the above state can change * concurrently again. Make sure we're truly done with it. @@ -6320,7 +6310,7 @@ static void mem_cgroup_move_charge(void) { lru_add_drain_all(); /* - * Signal lock_page_memcg() to take the memcg's move_lock + * Signal folio_memcg_lock() to take the memcg's move_lock * while we're moving its pages to another memcg. Then wait * for already started RCU-only updates to finish. */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index db7943999007..1d17fb1ec863 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2597,7 +2597,7 @@ EXPORT_SYMBOL(noop_dirty_folio); /* * Helper function for set_page_dirty family. * - * Caller must hold lock_page_memcg(). + * Caller must hold folio_memcg_lock(). * * NOTE: This relies on being atomic wrt interrupts. */ @@ -2631,7 +2631,7 @@ static void folio_account_dirtied(struct folio *folio, /* * Helper function for deaccounting dirty page without writeback. * - * Caller must hold lock_page_memcg(). + * Caller must hold folio_memcg_lock(). */ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) { @@ -2650,7 +2650,7 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) * If warn is true, then emit a warning if the folio is not uptodate and has * not been truncated. * - * The caller must hold lock_page_memcg(). Most callers have the folio + * The caller must hold folio_memcg_lock(). Most callers have the folio * locked. A few have the folio blocked from truncation through other * means (eg zap_vma_pages() has it mapped and is holding the page table * lock). This can also be called from mark_buffer_dirty(), which I -- cgit v1.2.3 From 16f8eb3eea9eb2a1568279d64ca4dc977e7aa538 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 21 Jun 2023 14:24:02 -0700 Subject: Revert "page cache: fix page_cache_next/prev_miss off by one" This reverts commit 9425c591e06a9ab27a145ba655fb50532cf0bcc9 The reverted commit fixed up routines primarily used by readahead code such that they could also be used by hugetlb. Unfortunately, this caused a performance regression as pointed out by the Closes: tag. The hugetlb code which uses page_cache_next_miss will be addressed in a subsequent patch. Link: https://lkml.kernel.org/r/20230621212403.174710-1-mike.kravetz@oracle.com Fixes: 9425c591e06a ("page cache: fix page_cache_next/prev_miss off by one") Signed-off-by: Mike Kravetz Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202306211346.1e9ff03e-oliver.sang@intel.com Reviewed-by: Sidhartha Kumar Cc: Ackerley Tng Cc: Erdem Aktas Cc: Greg Kroah-Hartman Cc: Matthew Wilcox Cc: Muchun Song Cc: Vishal Annapurve Signed-off-by: Andrew Morton --- mm/filemap.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index c20e0b1997e8..758bbdf300e7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1728,9 +1728,7 @@ bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm, * * Return: The index of the gap if found, otherwise an index outside the * range specified (in which case 'return - index >= max_scan' will be true). - * In the rare case of index wrap-around, 0 will be returned. 0 will also - * be returned if index == 0 and there is a gap at the index. We can not - * wrap-around if passed index == 0. + * In the rare case of index wrap-around, 0 will be returned. */ pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) @@ -1740,13 +1738,12 @@ pgoff_t page_cache_next_miss(struct address_space *mapping, while (max_scan--) { void *entry = xas_next(&xas); if (!entry || xa_is_value(entry)) - return xas.xa_index; - if (xas.xa_index == 0 && index != 0) - return xas.xa_index; + break; + if (xas.xa_index == 0) + break; } - /* No gaps in range and no wrap-around, return index beyond range */ - return xas.xa_index + 1; + return xas.xa_index; } EXPORT_SYMBOL(page_cache_next_miss); @@ -1767,9 +1764,7 @@ EXPORT_SYMBOL(page_cache_next_miss); * * Return: The index of the gap if found, otherwise an index outside the * range specified (in which case 'index - return >= max_scan' will be true). - * In the rare case of wrap-around, ULONG_MAX will be returned. ULONG_MAX - * will also be returned if index == ULONG_MAX and there is a gap at the - * index. We can not wrap-around if passed index == ULONG_MAX. + * In the rare case of wrap-around, ULONG_MAX will be returned. */ pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) @@ -1779,13 +1774,12 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, while (max_scan--) { void *entry = xas_prev(&xas); if (!entry || xa_is_value(entry)) - return xas.xa_index; - if (xas.xa_index == ULONG_MAX && index != ULONG_MAX) - return xas.xa_index; + break; + if (xas.xa_index == ULONG_MAX) + break; } - /* No gaps in range and no wrap-around, return index beyond range */ - return xas.xa_index - 1; + return xas.xa_index; } EXPORT_SYMBOL(page_cache_prev_miss); -- cgit v1.2.3