diff options
Diffstat (limited to 'fs')
36 files changed, 306 insertions, 306 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index cebba4eaa0b5..12c0ae29f185 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -68,6 +68,8 @@ void v9fs_cache_inode_get_cookie(struct inode *inode) &path, sizeof(path), &version, sizeof(version), i_size_read(&v9inode->netfs.inode)); + if (v9inode->netfs.cache) + mapping_set_release_always(inode->i_mapping); p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", inode, v9fs_inode_cookie(v9inode)); diff --git a/fs/Kconfig b/fs/Kconfig index 7da21f563192..4f8bd14df0df 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -169,6 +169,7 @@ source "fs/sysfs/Kconfig" config TMPFS bool "Tmpfs virtual memory file system support (former shm fs)" depends on SHMEM + select MEMFD_CREATE help Tmpfs is a file system which keeps all files in virtual memory. @@ -252,6 +253,7 @@ config HUGETLBFS bool "HugeTLB file system support" depends on X86 || IA64 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN depends on (SYSFS || SYSCTL) + select MEMFD_CREATE help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read @@ -264,7 +266,7 @@ config HUGETLB_PAGE config HUGETLB_PAGE_OPTIMIZE_VMEMMAP def_bool HUGETLB_PAGE - depends on ARCH_WANT_OPTIMIZE_VMEMMAP + depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP depends on SPARSEMEM_VMEMMAP config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON @@ -276,9 +278,6 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off (boot command line) or hugetlb_optimize_vmemmap (sysctl). -config MEMFD_CREATE - def_bool TMPFS || HUGETLBFS - config ARCH_HAS_GIGANTIC_PAGE bool diff --git a/fs/affs/file.c b/fs/affs/file.c index 472e2bdd5349..04c018e19602 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -523,21 +523,20 @@ affs_getemptyblk_ino(struct inode *inode, int block) return ERR_PTR(err); } -static int -affs_do_readpage_ofs(struct page *page, unsigned to, int create) +static int affs_do_read_folio_ofs(struct folio *folio, size_t to, int create) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh; - unsigned pos = 0; - u32 bidx, boff, bsize; + size_t pos = 0; + size_t bidx, boff, bsize; u32 tmp; - pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino, - page->index, to); - BUG_ON(to > PAGE_SIZE); + pr_debug("%s(%lu, %ld, 0, %zu)\n", __func__, inode->i_ino, + folio->index, to); + BUG_ON(to > folio_size(folio)); bsize = AFFS_SB(sb)->s_data_blksize; - tmp = page->index << PAGE_SHIFT; + tmp = folio_pos(folio); bidx = tmp / bsize; boff = tmp % bsize; @@ -547,7 +546,7 @@ affs_do_readpage_ofs(struct page *page, unsigned to, int create) return PTR_ERR(bh); tmp = min(bsize - boff, to - pos); BUG_ON(pos + tmp > to || tmp > bsize); - memcpy_to_page(page, pos, AFFS_DATA(bh) + boff, tmp); + memcpy_to_folio(folio, pos, AFFS_DATA(bh) + boff, tmp); affs_brelse(bh); bidx++; pos += tmp; @@ -627,25 +626,23 @@ out: return PTR_ERR(bh); } -static int -affs_read_folio_ofs(struct file *file, struct folio *folio) +static int affs_read_folio_ofs(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; - u32 to; + struct inode *inode = folio->mapping->host; + size_t to; int err; - pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, page->index); - to = PAGE_SIZE; - if (((page->index + 1) << PAGE_SHIFT) > inode->i_size) { - to = inode->i_size & ~PAGE_MASK; - memset(page_address(page) + to, 0, PAGE_SIZE - to); + pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, folio->index); + to = folio_size(folio); + if (folio_pos(folio) + to > inode->i_size) { + to = inode->i_size - folio_pos(folio); + folio_zero_segment(folio, to, folio_size(folio)); } - err = affs_do_readpage_ofs(page, to, 0); + err = affs_do_read_folio_ofs(folio, to, 0); if (!err) - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); return err; } @@ -654,7 +651,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; - struct page *page; + struct folio *folio; pgoff_t index; int err = 0; @@ -670,19 +667,20 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping } index = pos >> PAGE_SHIFT; - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; - *pagep = page; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); + *pagep = &folio->page; - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) return 0; /* XXX: inefficient but safe in the face of short writes */ - err = affs_do_readpage_ofs(page, PAGE_SIZE, 1); + err = affs_do_read_folio_ofs(folio, folio_size(folio), 1); if (err) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } return err; } @@ -691,6 +689,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh, *prev_bh; @@ -704,18 +703,18 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, to = from + len; /* * XXX: not sure if this can handle short copies (len < copied), but - * we don't have to, because the page should always be uptodate here, + * we don't have to, because the folio should always be uptodate here, * due to write_begin. */ pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos, pos + len); bsize = AFFS_SB(sb)->s_data_blksize; - data = page_address(page); + data = folio_address(folio); bh = NULL; written = 0; - tmp = (page->index << PAGE_SHIFT) + from; + tmp = (folio->index << PAGE_SHIFT) + from; bidx = tmp / bsize; boff = tmp % bsize; if (boff) { @@ -807,11 +806,11 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, from += tmp; bidx++; } - SetPageUptodate(page); + folio_mark_uptodate(folio); done: affs_brelse(bh); - tmp = (page->index << PAGE_SHIFT) + from; + tmp = (folio->index << PAGE_SHIFT) + from; if (tmp > inode->i_size) inode->i_size = AFFS_I(inode)->mmu_private = tmp; @@ -822,8 +821,8 @@ done: } err_first_bh: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return written; diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index 31d6446dc166..094aec8d17b8 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -13,10 +13,9 @@ static int affs_symlink_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; struct buffer_head *bh; - struct inode *inode = page->mapping->host; - char *link = page_address(page); + struct inode *inode = folio->mapping->host; + char *link = folio_address(folio); struct slink_front *lf; int i, j; char c; @@ -58,12 +57,11 @@ static int affs_symlink_read_folio(struct file *file, struct folio *folio) } link[i] = '\0'; affs_brelse(bh); - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); return 0; fail: - SetPageError(page); - unlock_page(page); + folio_unlock(folio); return -EIO; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 9d3d64921106..da73b97e19a9 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -681,6 +681,8 @@ static inline void afs_vnode_set_cache(struct afs_vnode *vnode, { #ifdef CONFIG_AFS_FSCACHE vnode->netfs.cache = cookie; + if (cookie) + mapping_set_release_always(vnode->netfs.inode.i_mapping); #endif } diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 005751a12911..40f2d9f1a17a 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -8,8 +8,6 @@ #include <linux/math64.h> #include <linux/rbtree.h> -#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) - /* * Enumerate bits using enum autoincrement. Define the @name as the n-th bit. */ diff --git a/fs/buffer.c b/fs/buffer.c index 084a6ade108a..0f17c36922e6 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1539,21 +1539,6 @@ void invalidate_bh_lrus_cpu(void) bh_lru_unlock(); } -void set_bh_page(struct buffer_head *bh, - struct page *page, unsigned long offset) -{ - bh->b_page = page; - BUG_ON(offset >= PAGE_SIZE); - if (PageHighMem(page)) - /* - * This catches illegal uses and preserves the offset: - */ - bh->b_data = (char *)(0 + offset); - else - bh->b_data = page_address(page) + offset; -} -EXPORT_SYMBOL(set_bh_page); - void folio_set_bh(struct buffer_head *bh, struct folio *folio, unsigned long offset) { @@ -2180,8 +2165,7 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, } EXPORT_SYMBOL(__block_write_begin); -static int __block_commit_write(struct inode *inode, struct folio *folio, - size_t from, size_t to) +static void __block_commit_write(struct folio *folio, size_t from, size_t to) { size_t block_start, block_end; bool partial = false; @@ -2216,7 +2200,6 @@ static int __block_commit_write(struct inode *inode, struct folio *folio, */ if (!partial) folio_mark_uptodate(folio); - return 0; } /* @@ -2253,7 +2236,6 @@ int block_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct folio *folio = page_folio(page); - struct inode *inode = mapping->host; size_t start = pos - folio_pos(folio); if (unlikely(copied < len)) { @@ -2277,7 +2259,7 @@ int block_write_end(struct file *file, struct address_space *mapping, flush_dcache_folio(folio); /* This could be a short (even 0-length) commit */ - __block_commit_write(inode, folio, start, start + copied); + __block_commit_write(folio, start, start + copied); return copied; } @@ -2598,12 +2580,10 @@ int cont_write_begin(struct file *file, struct address_space *mapping, } EXPORT_SYMBOL(cont_write_begin); -int block_commit_write(struct page *page, unsigned from, unsigned to) +void block_commit_write(struct page *page, unsigned from, unsigned to) { struct folio *folio = page_folio(page); - struct inode *inode = folio->mapping->host; - __block_commit_write(inode, folio, from, to); - return 0; + __block_commit_write(folio, from, to); } EXPORT_SYMBOL(block_commit_write); @@ -2649,11 +2629,11 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, end = size - folio_pos(folio); ret = __block_write_begin_int(folio, 0, end, get_block, NULL); - if (!ret) - ret = __block_commit_write(inode, folio, 0, end); - - if (unlikely(ret < 0)) + if (unlikely(ret)) goto out_unlock; + + __block_commit_write(folio, 0, end); + folio_mark_dirty(folio); folio_wait_stable(folio); return 0; diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index d9d22d0ec38a..7bf7a5fcc045 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -585,6 +585,8 @@ static bool cachefiles_open_file(struct cachefiles_object *object, if (ret < 0) goto check_failed; + clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &object->cookie->flags); + object->file = file; /* Always update the atime on an object we've just looked up (this is diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 177d8e8d73fe..de1dee46d3df 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -36,6 +36,8 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) &ci->i_vino, sizeof(ci->i_vino), &ci->i_version, sizeof(ci->i_version), i_size_read(inode)); + if (ci->netfs.cache) + mapping_set_release_always(inode->i_mapping); } void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info *ci) @@ -30,17 +30,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/fs_dax.h> -static inline unsigned int pe_order(enum page_entry_size pe_size) -{ - if (pe_size == PE_SIZE_PTE) - return PAGE_SHIFT - PAGE_SHIFT; - if (pe_size == PE_SIZE_PMD) - return PMD_SHIFT - PAGE_SHIFT; - if (pe_size == PE_SIZE_PUD) - return PUD_SHIFT - PAGE_SHIFT; - return ~0; -} - /* We choose 4096 entries - same as per-zone page wait tables */ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) @@ -49,9 +38,6 @@ static inline unsigned int pe_order(enum page_entry_size pe_size) #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT) -/* The order of a PMD entry */ -#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) - static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; static int __init init_dax_wait_table(void) @@ -1908,7 +1894,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, /** * dax_iomap_fault - handle a page fault on a DAX file * @vmf: The description of the fault - * @pe_size: Size of the page to fault in + * @order: Order of the page to fault in * @pfnp: PFN to insert for synchronous faults if fsync is required * @iomap_errp: Storage for detailed error code in case of error * @ops: Iomap ops passed from the file system @@ -1918,17 +1904,15 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * has done all the necessary locking for page fault to proceed * successfully. */ -vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, +vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order, pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { - switch (pe_size) { - case PE_SIZE_PTE: + if (order == 0) return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops); - case PE_SIZE_PMD: + else if (order == PMD_ORDER) return dax_iomap_pmd_fault(vmf, pfnp, ops); - default: + else return VM_FAULT_FALLBACK; - } } EXPORT_SYMBOL_GPL(dax_iomap_fault); @@ -1979,19 +1963,18 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) /** * dax_finish_sync_fault - finish synchronous page fault * @vmf: The description of the fault - * @pe_size: Size of entry to be inserted + * @order: Order of entry to be inserted * @pfn: PFN to insert * * This function ensures that the file range touched by the page fault is * stored persistently on the media and handles inserting of appropriate page * table entry. */ -vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, - enum page_entry_size pe_size, pfn_t pfn) +vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, + pfn_t pfn) { int err; loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; - unsigned int order = pe_order(pe_size); size_t len = PAGE_SIZE << order; err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); diff --git a/fs/drop_caches.c b/fs/drop_caches.c index e619c31b6bd9..b9575957a7c2 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -10,6 +10,7 @@ #include <linux/writeback.h> #include <linux/sysctl.h> #include <linux/gfp.h> +#include <linux/swap.h> #include "internal.h" /* A global variable is a bit ugly, but it keeps the code simple */ @@ -59,6 +60,7 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write, static int stfu; if (sysctl_drop_caches & 1) { + lru_add_drain_all(); iterate_supers(drop_pagecache_sb, NULL); count_vm_event(DROP_PAGECACHE); } diff --git a/fs/erofs/data.c b/fs/erofs/data.c index db5e4b7636ec..0c2c99c58b5e 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -413,14 +413,14 @@ const struct address_space_operations erofs_raw_access_aops = { #ifdef CONFIG_FS_DAX static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) + unsigned int order) { - return dax_iomap_fault(vmf, pe_size, NULL, NULL, &erofs_iomap_ops); + return dax_iomap_fault(vmf, order, NULL, NULL, &erofs_iomap_ops); } static vm_fault_t erofs_dax_fault(struct vm_fault *vmf) { - return erofs_dax_huge_fault(vmf, PE_SIZE_PTE); + return erofs_dax_huge_fault(vmf, 0); } static const struct vm_operations_struct erofs_dax_vm_ops = { diff --git a/fs/exec.c b/fs/exec.c index 1a827d55ba94..0b9484358a49 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -701,6 +701,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) if (vma != vma_next(&vmi)) return -EFAULT; + vma_iter_prev_range(&vmi); /* * cover the whole range: [new_start, old_end) */ diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index eca60b747c6b..c8049c90323d 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -36,8 +36,6 @@ */ -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb, unsigned int block_group, struct buffer_head ** bh) diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 0b4c91c62e1f..1039e5bf90af 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -103,7 +103,7 @@ static vm_fault_t ext2_dax_fault(struct vm_fault *vmf) } filemap_invalidate_lock_shared(inode->i_mapping); - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL, &ext2_iomap_ops); + ret = dax_iomap_fault(vmf, 0, NULL, NULL, &ext2_iomap_ops); filemap_invalidate_unlock_shared(inode->i_mapping); if (write) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1e2259d9967d..481491e892df 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3780,8 +3780,6 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - /* For ioend & aio unwritten conversion wait queues */ #define EXT4_WQ_HASH_SZ 37 #define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index c457c8517f0f..2dc3f8301225 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -723,8 +723,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } #ifdef CONFIG_FS_DAX -static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) +static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order) { int error = 0; vm_fault_t result; @@ -740,7 +739,7 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, * read-only. * * We check for VM_SHARED rather than vmf->cow_page since the latter is - * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for + * unset for order != 0 (i.e. only in do_cow_fault); for * other sizes, dax_iomap_fault will handle splitting / fallback so that * we eventually come back with a COW page. */ @@ -764,7 +763,7 @@ retry: } else { filemap_invalidate_lock_shared(mapping); } - result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops); + result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops); if (write) { ext4_journal_stop(handle); @@ -773,7 +772,7 @@ retry: goto retry; /* Handling synchronous page fault? */ if (result & VM_FAULT_NEEDDSYNC) - result = dax_finish_sync_fault(vmf, pe_size, pfn); + result = dax_finish_sync_fault(vmf, order, pfn); filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); } else { @@ -785,7 +784,7 @@ retry: static vm_fault_t ext4_dax_fault(struct vm_fault *vmf) { - return ext4_dax_huge_fault(vmf, PE_SIZE_PTE); + return ext4_dax_huge_fault(vmf, 0); } static const struct vm_operations_struct ext4_dax_vm_ops = { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6683076ecb2f..d3f581ced672 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1569,7 +1569,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, if (folio->index < mpd->first_page) continue; - if (folio->index + folio_nr_pages(folio) - 1 > end) + if (folio_next_index(folio) - 1 > end) continue; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); @@ -2455,7 +2455,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) if (mpd->map.m_len == 0) mpd->first_page = folio->index; - mpd->next_page = folio->index + folio_nr_pages(folio); + mpd->next_page = folio_next_index(folio); /* * Writeout when we cannot modify metadata is simple. * Just submit the page. For data=journal mode we diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index b5af2fc03b2f..18a9e7c47975 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -340,10 +340,8 @@ again: ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } - if ((folio_has_private(folio[0]) && - !filemap_release_folio(folio[0], 0)) || - (folio_has_private(folio[1]) && - !filemap_release_folio(folio[1], 0))) { + if (!filemap_release_folio(folio[0], 0) || + !filemap_release_folio(folio[1], 0)) { *err = -EBUSY; goto drop_data_sem; } @@ -362,10 +360,8 @@ data_copy: /* At this point all buffers in range are uptodate, old mapping layout * is no longer required, try to drop it now. */ - if ((folio_has_private(folio[0]) && - !filemap_release_folio(folio[0], 0)) || - (folio_has_private(folio[1]) && - !filemap_release_folio(folio[1], 0))) { + if (!filemap_release_folio(folio[0], 0) || + !filemap_release_folio(folio[1], 0)) { *err = -EBUSY; goto unlock_folios; } @@ -392,14 +388,11 @@ data_copy: for (i = 0; i < block_len_in_page; i++) { *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); if (*err < 0) - break; + goto repair_branches; bh = bh->b_this_page; } - if (!*err) - *err = block_commit_write(&folio[0]->page, from, from + replaced_size); - if (unlikely(*err < 0)) - goto repair_branches; + block_commit_write(&folio[0]->page, from, from + replaced_size); /* Even in case of data=writeback it is reasonable to pin * inode to transaction, to prevent unexpected data loss */ diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 8e74f278a3f6..23904a6a9a96 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -784,8 +784,8 @@ static int fuse_dax_writepages(struct address_space *mapping, return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc); } -static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, - enum page_entry_size pe_size, bool write) +static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, unsigned int order, + bool write) { vm_fault_t ret; struct inode *inode = file_inode(vmf->vma->vm_file); @@ -809,7 +809,7 @@ retry: * to populate page cache or access memory we are trying to free. */ filemap_invalidate_lock_shared(inode->i_mapping); - ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops); + ret = dax_iomap_fault(vmf, order, &pfn, &error, &fuse_iomap_ops); if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) { error = 0; retry = true; @@ -818,7 +818,7 @@ retry: } if (ret & VM_FAULT_NEEDDSYNC) - ret = dax_finish_sync_fault(vmf, pe_size, pfn); + ret = dax_finish_sync_fault(vmf, order, pfn); filemap_invalidate_unlock_shared(inode->i_mapping); if (write) @@ -829,24 +829,22 @@ retry: static vm_fault_t fuse_dax_fault(struct vm_fault *vmf) { - return __fuse_dax_fault(vmf, PE_SIZE_PTE, - vmf->flags & FAULT_FLAG_WRITE); + return __fuse_dax_fault(vmf, 0, vmf->flags & FAULT_FLAG_WRITE); } -static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) +static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf, unsigned int order) { - return __fuse_dax_fault(vmf, pe_size, vmf->flags & FAULT_FLAG_WRITE); + return __fuse_dax_fault(vmf, order, vmf->flags & FAULT_FLAG_WRITE); } static vm_fault_t fuse_dax_page_mkwrite(struct vm_fault *vmf) { - return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); + return __fuse_dax_fault(vmf, 0, true); } static vm_fault_t fuse_dax_pfn_mkwrite(struct vm_fault *vmf) { - return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); + return __fuse_dax_fault(vmf, 0, true); } static const struct vm_operations_struct fuse_dax_vm_ops = { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 93d3bcfd4fc8..316c4cebd3f3 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -283,6 +283,41 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, #endif /* + * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset. + * Returns the maximum number of bytes one can read without touching the 1st raw + * HWPOISON subpage. + * + * The implementation borrows the iteration logic from copy_page_to_iter*. + */ +static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t bytes) +{ + size_t n = 0; + size_t res = 0; + + /* First subpage to start the loop. */ + page += offset / PAGE_SIZE; + offset %= PAGE_SIZE; + while (1) { + if (is_raw_hwpoison_page_in_hugepage(page)) + break; + + /* Safe to read n bytes without touching HWPOISON subpage. */ + n = min(bytes, (size_t)PAGE_SIZE - offset); + res += n; + bytes -= n; + if (!bytes || !n) + break; + offset += n; + if (offset == PAGE_SIZE) { + page++; + offset = 0; + } + } + + return res; +} + +/* * Support for read() - Find the page attached to f_mapping and copy out the * data. This provides functionality similar to filemap_read(). */ @@ -300,7 +335,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) while (iov_iter_count(to)) { struct page *page; - size_t nr, copied; + size_t nr, copied, want; /* nr is the maximum number of bytes to copy from this page */ nr = huge_page_size(h); @@ -328,16 +363,26 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) } else { unlock_page(page); - if (PageHWPoison(page)) { - put_page(page); - retval = -EIO; - break; + if (!PageHWPoison(page)) + want = nr; + else { + /* + * Adjust how many bytes safe to read without + * touching the 1st raw HWPOISON subpage after + * offset. + */ + want = adjust_range_hwpoison(page, offset, nr); + if (want == 0) { + put_page(page); + retval = -EIO; + break; + } } /* * We have the page, copy it to user space buffer. */ - copied = copy_page_to_iter(page, offset, nr, to); + copied = copy_page_to_iter(page, offset, want, to); put_page(page); } offset += copied; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index fbce16fedaa4..1b5a45ab62b0 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -341,7 +341,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, int do_escape = 0; char *mapped_data; struct buffer_head *new_bh; - struct page *new_page; + struct folio *new_folio; unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); journal_t *journal = transaction->t_journal; @@ -370,14 +370,14 @@ repeat: */ if (jh_in->b_frozen_data) { done_copy_out = 1; - new_page = virt_to_page(jh_in->b_frozen_data); - new_offset = offset_in_page(jh_in->b_frozen_data); + new_folio = virt_to_folio(jh_in->b_frozen_data); + new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data); } else { - new_page = jh2bh(jh_in)->b_page; - new_offset = offset_in_page(jh2bh(jh_in)->b_data); + new_folio = jh2bh(jh_in)->b_folio; + new_offset = offset_in_folio(new_folio, jh2bh(jh_in)->b_data); } - mapped_data = kmap_atomic(new_page); + mapped_data = kmap_local_folio(new_folio, new_offset); /* * Fire data frozen trigger if data already wasn't frozen. Do this * before checking for escaping, as the trigger may modify the magic @@ -385,18 +385,17 @@ repeat: * data in the buffer. */ if (!done_copy_out) - jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset, + jbd2_buffer_frozen_trigger(jh_in, mapped_data, jh_in->b_triggers); /* * Check for escaping */ - if (*((__be32 *)(mapped_data + new_offset)) == - cpu_to_be32(JBD2_MAGIC_NUMBER)) { + if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER)) { need_copy_out = 1; do_escape = 1; } - kunmap_atomic(mapped_data); + kunmap_local(mapped_data); /* * Do we need to do a data copy? @@ -417,12 +416,10 @@ repeat: } jh_in->b_frozen_data = tmp; - mapped_data = kmap_atomic(new_page); - memcpy(tmp, mapped_data + new_offset, bh_in->b_size); - kunmap_atomic(mapped_data); + memcpy_from_folio(tmp, new_folio, new_offset, bh_in->b_size); - new_page = virt_to_page(tmp); - new_offset = offset_in_page(tmp); + new_folio = virt_to_folio(tmp); + new_offset = offset_in_folio(new_folio, tmp); done_copy_out = 1; /* @@ -438,12 +435,12 @@ repeat: * copying, we can finally do so. */ if (do_escape) { - mapped_data = kmap_atomic(new_page); - *((unsigned int *)(mapped_data + new_offset)) = 0; - kunmap_atomic(mapped_data); + mapped_data = kmap_local_folio(new_folio, new_offset); + *((unsigned int *)mapped_data) = 0; + kunmap_local(mapped_data); } - set_bh_page(new_bh, new_page, new_offset); + folio_set_bh(new_bh, new_folio, new_offset); new_bh->b_size = bh_in->b_size; new_bh->b_bdev = journal->j_dev; new_bh->b_blocknr = blocknr; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 8c35d88a84b1..b05717fe0d4e 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -180,6 +180,9 @@ void nfs_fscache_init_inode(struct inode *inode) &auxdata, /* aux_data */ sizeof(auxdata), i_size_read(inode)); + + if (netfs_inode(inode)->cache) + mapping_set_release_always(inode->i_mapping); } /* diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 4123e126c4d0..eb2ed0701495 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -556,7 +556,7 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, struct super_block *sb = inode->i_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; struct ntfs_inode *ni = ntfs_i(inode); - struct page *page = bh->b_page; + struct folio *folio = bh->b_folio; u8 cluster_bits = sbi->cluster_bits; u32 block_size = sb->s_blocksize; u64 bytes, lbo, valid; @@ -571,7 +571,7 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, if (is_resident(ni)) { ni_lock(ni); - err = attr_data_read_resident(ni, page); + err = attr_data_read_resident(ni, &folio->page); ni_unlock(ni); if (!err) @@ -644,17 +644,17 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, */ bytes = block_size; - if (page) { + if (folio) { u32 voff = valid - vbo; bh->b_size = block_size; off = vbo & (PAGE_SIZE - 1); - set_bh_page(bh, page, off); + folio_set_bh(bh, folio, off); err = bh_read(bh, 0); if (err < 0) goto out; - zero_user_segment(page, off + voff, off + block_size); + folio_zero_segment(folio, off + voff, off + block_size); } } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 3b91b4cc7c6a..c45596c25c66 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -810,12 +810,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, /* must not update i_size! */ - ret = block_commit_write(page, block_start + 1, - block_start + 1); - if (ret < 0) - mlog_errno(ret); - else - ret = 0; + block_commit_write(page, block_start + 1, block_start + 1); } /* diff --git a/fs/proc/base.c b/fs/proc/base.c index 7576effe8d52..02e1cf03d94b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3207,6 +3207,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, mm = get_task_mm(task); if (mm) { seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); + seq_printf(m, "ksm_zero_pages %lu\n", mm->ksm_zero_pages); seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); mmput(mm); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 8dca4d6d96c7..45af9a989d40 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -17,6 +17,7 @@ #ifdef CONFIG_CMA #include <linux/cma.h> #endif +#include <linux/zswap.h> #include <asm/page.h> #include "internal.h" @@ -132,17 +133,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "VmallocChunk: ", 0ul); show_val_kb(m, "Percpu: ", pcpu_nr_pages()); -#ifdef CONFIG_MEMTEST - if (early_memtest_done) { - unsigned long early_memtest_bad_size_kb; - - early_memtest_bad_size_kb = early_memtest_bad_size>>10; - if (early_memtest_bad_size && !early_memtest_bad_size_kb) - early_memtest_bad_size_kb = 1; - /* When 0 is reported, it means there actually was a successful test */ - seq_printf(m, "EarlyMemtestBad: %5lu kB\n", early_memtest_bad_size_kb); - } -#endif + memtest_report_meminfo(m); #ifdef CONFIG_MEMORY_FAILURE seq_printf(m, "HardwareCorrupted: %5lu kB\n", diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fafff1bd34cd..15ddf4653a19 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -236,21 +236,6 @@ static int do_maps_open(struct inode *inode, struct file *file, sizeof(struct proc_maps_private)); } -/* - * Indicate if the VMA is a stack for the given task; for - * /proc/PID/maps that is the stack of the main task. - */ -static int is_stack(struct vm_area_struct *vma) -{ - /* - * We make no effort to guess what a given thread considers to be - * its "stack". It's not even well-defined for programs written - * languages like Go. - */ - return vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack; -} - static void show_vma_header_prefix(struct seq_file *m, unsigned long start, unsigned long end, vm_flags_t flags, unsigned long long pgoff, @@ -327,13 +312,12 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) goto done; } - if (vma->vm_start <= mm->brk && - vma->vm_end >= mm->start_brk) { + if (vma_is_initial_heap(vma)) { name = "[heap]"; goto done; } - if (is_stack(vma)) { + if (vma_is_initial_stack(vma)) { name = "[stack]"; goto done; } @@ -871,7 +855,7 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); - seq_printf(m, "THPeligible: %d\n", + seq_printf(m, "THPeligible: %8u\n", hugepage_vma_check(vma, vma->vm_flags, true, false, true)); if (arch_pkeys_enabled()) @@ -1975,9 +1959,9 @@ static int show_numa_map(struct seq_file *m, void *v) if (file) { seq_puts(m, " file="); seq_file_path(m, file, "\n\t= "); - } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + } else if (vma_is_initial_heap(vma)) { seq_puts(m, " heap"); - } else if (is_stack(vma)) { + } else if (vma_is_initial_stack(vma)) { seq_puts(m, " stack"); } diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 2c8b62265981..a8ac0dd8041e 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -121,19 +121,6 @@ unsigned long task_statm(struct mm_struct *mm, return size; } -static int is_stack(struct vm_area_struct *vma) -{ - struct mm_struct *mm = vma->vm_mm; - - /* - * We make no effort to guess what a given thread considers to be - * its "stack". It's not even well-defined for programs written - * languages like Go. - */ - return vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack; -} - /* * display a single VMA to a sequenced file */ @@ -171,7 +158,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) if (file) { seq_pad(m, ' '); seq_file_path(m, file, ""); - } else if (mm && is_stack(vma)) { + } else if (mm && vma_is_initial_stack(vma)) { seq_pad(m, ' '); seq_puts(m, "[stack]"); } diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c index 8f6909d633da..3677525ee993 100644 --- a/fs/smb/client/fscache.c +++ b/fs/smb/client/fscache.c @@ -108,6 +108,8 @@ void cifs_fscache_get_inode_cookie(struct inode *inode) &cifsi->uniqueid, sizeof(cifsi->uniqueid), &cd, sizeof(cd), i_size_read(&cifsi->netfs.inode)); + if (cifsi->netfs.cache) + mapping_set_release_always(inode->i_mapping); } void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) diff --git a/fs/splice.c b/fs/splice.c index 02631013b09f..d983d375ff11 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -83,8 +83,7 @@ static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe, */ folio_wait_writeback(folio); - if (folio_has_private(folio) && - !filemap_release_folio(folio, GFP_KERNEL)) + if (!filemap_release_folio(folio, GFP_KERNEL)) goto out_unlock; /* diff --git a/fs/udf/file.c b/fs/udf/file.c index 243840dc83ad..0292d75e60cc 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -63,13 +63,13 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf) else end = PAGE_SIZE; err = __block_write_begin(page, 0, end, udf_get_block); - if (!err) - err = block_commit_write(page, 0, end); - if (err < 0) { + if (err) { unlock_page(page); ret = block_page_mkwrite_return(err); goto out_unlock; } + + block_commit_write(page, 0, end); out_dirty: set_page_dirty(page); wait_for_stable_page(page); diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 4931bec1a01c..89247193d96d 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -11,12 +11,6 @@ #include <linux/fs.h> #include "swab.h" - -/* - * some useful macros - */ -#define in_range(b,first,len) ((b)>=(first)&&(b)<(first)+(len)) - /* * functions used for retyping */ diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 7cecd49e078b..56eaae9dac1a 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -277,17 +277,16 @@ static inline struct uffd_msg userfault_msg(unsigned long address, * hugepmd ranges. */ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, - struct vm_area_struct *vma, - unsigned long address, - unsigned long flags, - unsigned long reason) + struct vm_fault *vmf, + unsigned long reason) { + struct vm_area_struct *vma = vmf->vma; pte_t *ptep, pte; bool ret = true; - mmap_assert_locked(ctx->mm); + assert_fault_locked(vmf); - ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma)); + ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma)); if (!ptep) goto out; @@ -308,10 +307,8 @@ out: } #else static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, - struct vm_area_struct *vma, - unsigned long address, - unsigned long flags, - unsigned long reason) + struct vm_fault *vmf, + unsigned long reason) { return false; /* should never get here */ } @@ -325,11 +322,11 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, * threads. */ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, - unsigned long address, - unsigned long flags, + struct vm_fault *vmf, unsigned long reason) { struct mm_struct *mm = ctx->mm; + unsigned long address = vmf->address; pgd_t *pgd; p4d_t *p4d; pud_t *pud; @@ -338,7 +335,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, pte_t ptent; bool ret = true; - mmap_assert_locked(mm); + assert_fault_locked(vmf); pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -427,20 +424,16 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) * * We also don't do userfault handling during * coredumping. hugetlbfs has the special - * follow_hugetlb_page() to skip missing pages in the + * hugetlb_follow_page_mask() to skip missing pages in the * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with * the no_page_table() helper in follow_page_mask(), but the * shmem_vm_ops->fault method is invoked even during - * coredumping without mmap_lock and it ends up here. + * coredumping and it ends up here. */ if (current->flags & (PF_EXITING|PF_DUMPCORE)) goto out; - /* - * Coredumping runs without mmap_lock so we can only check that - * the mmap_lock is held, if PF_DUMPCORE was not set. - */ - mmap_assert_locked(mm); + assert_fault_locked(vmf); ctx = vma->vm_userfaultfd_ctx.ctx; if (!ctx) @@ -556,15 +549,12 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) spin_unlock_irq(&ctx->fault_pending_wqh.lock); if (!is_vm_hugetlb_page(vma)) - must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, - reason); + must_wait = userfaultfd_must_wait(ctx, vmf, reason); else - must_wait = userfaultfd_huge_must_wait(ctx, vma, - vmf->address, - vmf->flags, reason); + must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason); if (is_vm_hugetlb_page(vma)) hugetlb_vma_unlock_read(vma); - mmap_read_unlock(mm); + release_fault_lock(vmf); if (likely(must_wait && !READ_ONCE(ctx->released))) { wake_up_poll(&ctx->fd_wqh, EPOLLIN); @@ -667,6 +657,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, mmap_write_lock(mm); for_each_vma(vmi, vma) { if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { + vma_start_write(vma); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); @@ -702,6 +693,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) octx = vma->vm_userfaultfd_ctx.ctx; if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { + vma_start_write(vma); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); return 0; @@ -783,6 +775,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, atomic_inc(&ctx->mmap_changing); } else { /* Drop uffd context if remap feature not enabled */ + vma_start_write(vma); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); } @@ -940,6 +933,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) prev = vma; } + vma_start_write(vma); userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } @@ -1289,13 +1283,11 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, __wake_userfault(ctx, range); } -static __always_inline int validate_range(struct mm_struct *mm, - __u64 start, __u64 len) +static __always_inline int validate_unaligned_range( + struct mm_struct *mm, __u64 start, __u64 len) { __u64 task_size = mm->task_size; - if (start & ~PAGE_MASK) - return -EINVAL; if (len & ~PAGE_MASK) return -EINVAL; if (!len) @@ -1306,9 +1298,20 @@ static __always_inline int validate_range(struct mm_struct *mm, return -EINVAL; if (len > task_size - start) return -EINVAL; + if (start + len <= start) + return -EINVAL; return 0; } +static __always_inline int validate_range(struct mm_struct *mm, + __u64 start, __u64 len) +{ + if (start & ~PAGE_MASK) + return -EINVAL; + + return validate_unaligned_range(mm, start, len); +} + static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long arg) { @@ -1502,6 +1505,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, * the next vma was merged into the current one and * the current one has not been updated yet. */ + vma_start_write(vma); userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx.ctx = ctx; @@ -1685,6 +1689,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * the next vma was merged into the current one and * the current one has not been updated yet. */ + vma_start_write(vma); userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; @@ -1757,17 +1762,15 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, sizeof(uffdio_copy)-sizeof(__s64))) goto out; + ret = validate_unaligned_range(ctx->mm, uffdio_copy.src, + uffdio_copy.len); + if (ret) + goto out; ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); if (ret) goto out; - /* - * double check for wraparound just in case. copy_from_user() - * will later check uffdio_copy.src + uffdio_copy.len to fit - * in the userland range. - */ + ret = -EINVAL; - if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src) - goto out; if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) goto out; if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP) @@ -1927,11 +1930,6 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) goto out; ret = -EINVAL; - /* double check for wraparound just in case. */ - if (uffdio_continue.range.start + uffdio_continue.range.len <= - uffdio_continue.range.start) { - goto out; - } if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE | UFFDIO_CONTINUE_MODE_WP)) goto out; @@ -1965,6 +1963,61 @@ out: return ret; } +static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg) +{ + __s64 ret; + struct uffdio_poison uffdio_poison; + struct uffdio_poison __user *user_uffdio_poison; + struct userfaultfd_wake_range range; + + user_uffdio_poison = (struct uffdio_poison __user *)arg; + + ret = -EAGAIN; + if (atomic_read(&ctx->mmap_changing)) + goto out; + + ret = -EFAULT; + if (copy_from_user(&uffdio_poison, user_uffdio_poison, + /* don't copy the output fields */ + sizeof(uffdio_poison) - (sizeof(__s64)))) + goto out; + + ret = validate_range(ctx->mm, uffdio_poison.range.start, + uffdio_poison.range.len); + if (ret) + goto out; + + ret = -EINVAL; + if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE) + goto out; + + if (mmget_not_zero(ctx->mm)) { + ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start, + uffdio_poison.range.len, + &ctx->mmap_changing, 0); + mmput(ctx->mm); + } else { + return -ESRCH; + } + + if (unlikely(put_user(ret, &user_uffdio_poison->updated))) + return -EFAULT; + if (ret < 0) + goto out; + + /* len == 0 would wake all */ + BUG_ON(!ret); + range.len = ret; + if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) { + range.start = uffdio_poison.range.start; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN; + +out: + return ret; +} + static inline unsigned int uffd_ctx_features(__u64 user_features) { /* @@ -2066,6 +2119,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, case UFFDIO_CONTINUE: ret = userfaultfd_continue(ctx, arg); break; + case UFFDIO_POISON: + ret = userfaultfd_poison(ctx, arg); + break; } return ret; } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 4f502219ae4f..203700278ddb 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1287,11 +1287,11 @@ xfs_file_llseek( static inline vm_fault_t xfs_dax_fault( struct vm_fault *vmf, - enum page_entry_size pe_size, + unsigned int order, bool write_fault, pfn_t *pfn) { - return dax_iomap_fault(vmf, pe_size, pfn, NULL, + return dax_iomap_fault(vmf, order, pfn, NULL, (write_fault && !vmf->cow_page) ? &xfs_dax_write_iomap_ops : &xfs_read_iomap_ops); @@ -1300,7 +1300,7 @@ xfs_dax_fault( static inline vm_fault_t xfs_dax_fault( struct vm_fault *vmf, - enum page_entry_size pe_size, + unsigned int order, bool write_fault, pfn_t *pfn) { @@ -1322,14 +1322,14 @@ xfs_dax_fault( static vm_fault_t __xfs_filemap_fault( struct vm_fault *vmf, - enum page_entry_size pe_size, + unsigned int order, bool write_fault) { struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); vm_fault_t ret; - trace_xfs_filemap_fault(ip, pe_size, write_fault); + trace_xfs_filemap_fault(ip, order, write_fault); if (write_fault) { sb_start_pagefault(inode->i_sb); @@ -1340,9 +1340,9 @@ __xfs_filemap_fault( pfn_t pfn; xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn); + ret = xfs_dax_fault(vmf, order, write_fault, &pfn); if (ret & VM_FAULT_NEEDDSYNC) - ret = dax_finish_sync_fault(vmf, pe_size, pfn); + ret = dax_finish_sync_fault(vmf, order, pfn); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); } else { if (write_fault) { @@ -1373,7 +1373,7 @@ xfs_filemap_fault( struct vm_fault *vmf) { /* DAX can shortcut the normal fault path on write faults! */ - return __xfs_filemap_fault(vmf, PE_SIZE_PTE, + return __xfs_filemap_fault(vmf, 0, IS_DAX(file_inode(vmf->vma->vm_file)) && xfs_is_write_fault(vmf)); } @@ -1381,13 +1381,13 @@ xfs_filemap_fault( static vm_fault_t xfs_filemap_huge_fault( struct vm_fault *vmf, - enum page_entry_size pe_size) + unsigned int order) { if (!IS_DAX(file_inode(vmf->vma->vm_file))) return VM_FAULT_FALLBACK; /* DAX can shortcut the normal fault path on write faults! */ - return __xfs_filemap_fault(vmf, pe_size, + return __xfs_filemap_fault(vmf, order, xfs_is_write_fault(vmf)); } @@ -1395,7 +1395,7 @@ static vm_fault_t xfs_filemap_page_mkwrite( struct vm_fault *vmf) { - return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); + return __xfs_filemap_fault(vmf, 0, true); } /* @@ -1408,7 +1408,7 @@ xfs_filemap_pfn_mkwrite( struct vm_fault *vmf) { - return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); + return __xfs_filemap_fault(vmf, 0, true); } static const struct vm_operations_struct xfs_file_vm_ops = { diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f3cc204bb4bf..fd789e00dfd6 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -802,36 +802,28 @@ DEFINE_INODE_EVENT(xfs_inode_inactivating); * ring buffer. Somehow this was only worth mentioning in the ftrace sample * code. */ -TRACE_DEFINE_ENUM(PE_SIZE_PTE); -TRACE_DEFINE_ENUM(PE_SIZE_PMD); -TRACE_DEFINE_ENUM(PE_SIZE_PUD); - TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); TRACE_EVENT(xfs_filemap_fault, - TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size, - bool write_fault), - TP_ARGS(ip, pe_size, write_fault), + TP_PROTO(struct xfs_inode *ip, unsigned int order, bool write_fault), + TP_ARGS(ip, order, write_fault), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) - __field(enum page_entry_size, pe_size) + __field(unsigned int, order) __field(bool, write_fault) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->pe_size = pe_size; + __entry->order = order; __entry->write_fault = write_fault; ), - TP_printk("dev %d:%d ino 0x%llx %s write_fault %d", + TP_printk("dev %d:%d ino 0x%llx order %u write_fault %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __print_symbolic(__entry->pe_size, - { PE_SIZE_PTE, "PTE" }, - { PE_SIZE_PMD, "PMD" }, - { PE_SIZE_PUD, "PUD" }), + __entry->order, __entry->write_fault) ) |