-rw-r--r--  .mailmap                         |   2
-rw-r--r--  Documentation/mm/zsmalloc.rst    | 135
-rw-r--r--  fs/dax.c                         |  37
-rw-r--r--  fs/fs-writeback.c                |  17
-rw-r--r--  fs/nilfs2/btree.c                |   1
-rw-r--r--  fs/nilfs2/direct.c               |   1
-rw-r--r--  fs/nilfs2/segment.c              |   3
-rw-r--r--  fs/nilfs2/super.c                |   2
-rw-r--r--  fs/nilfs2/the_nilfs.c            |  12
-rw-r--r--  fs/userfaultfd.c                 |   6
-rw-r--r--  include/linux/mm_types.h         |   3
-rw-r--r--  kernel/fork.c                    |   3
-rw-r--r--  lib/maple_tree.c                 | 306
-rw-r--r--  mm/backing-dev.c                 |  12
-rw-r--r--  mm/huge_memory.c                 |  19
-rw-r--r--  mm/hugetlb.c                     |  14
-rw-r--r--  mm/khugepaged.c                  |   4
-rw-r--r--  mm/memory.c                      |  16
-rw-r--r--  mm/mempolicy.c                   | 104
-rw-r--r--  mm/mmap.c                        |   3
-rw-r--r--  mm/mprotect.c                    |   2
-rw-r--r--  mm/swap.c                        |   2
-rw-r--r--  mm/swapfile.c                    |   3
-rw-r--r--  mm/vmalloc.c                     |   8
-rw-r--r--  tools/mm/page_owner_sort.c       |   2
-rw-r--r--  tools/testing/radix-tree/maple.c |  16
26 files changed, 476 insertions(+), 257 deletions(-)
@@ -232,6 +232,8 @@ Johan Hovold <johan@kernel.org> <johan@hovoldconsulting.com> John Crispin <john@phrozen.org> <blogic@openwrt.org> John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> John Stultz <johnstul@us.ibm.com> +<jon.toppins+linux@gmail.com> <jtoppins@cumulusnetworks.com> +<jon.toppins+linux@gmail.com> <jtoppins@redhat.com> Jordan Crouse <jordan@cosmicpenguin.net> <jcrouse@codeaurora.org> <josh@joshtriplett.org> <josh@freedesktop.org> <josh@joshtriplett.org> <josh@kernel.org> diff --git a/Documentation/mm/zsmalloc.rst b/Documentation/mm/zsmalloc.rst index 64d127bfc221..a3c26d587752 100644 --- a/Documentation/mm/zsmalloc.rst +++ b/Documentation/mm/zsmalloc.rst @@ -39,13 +39,12 @@ With CONFIG_ZSMALLOC_STAT, we could see zsmalloc internal information via # cat /sys/kernel/debug/zsmalloc/zram0/classes - class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage + class size 10% 20% 30% 40% 50% 60% 70% 80% 90% 99% 100% obj_allocated obj_used pages_used pages_per_zspage freeable ... ... - 9 176 0 1 186 129 8 4 - 10 192 1 0 2880 2872 135 3 - 11 208 0 1 819 795 42 2 - 12 224 0 1 219 159 12 4 + 30 512 0 12 4 1 0 1 0 0 1 0 414 3464 3346 433 1 14 + 31 528 2 7 2 2 1 0 1 0 0 2 117 4154 3793 536 4 44 + 32 544 6 3 4 1 2 1 0 0 0 1 260 4170 3965 556 2 26 ... ... @@ -54,10 +53,28 @@ class index size object size zspage stores -almost_empty - the number of ZS_ALMOST_EMPTY zspages(see below) -almost_full - the number of ZS_ALMOST_FULL zspages(see below) +10% + the number of zspages with usage ratio less than 10% (see below) +20% + the number of zspages with usage ratio between 10% and 20% +30% + the number of zspages with usage ratio between 20% and 30% +40% + the number of zspages with usage ratio between 30% and 40% +50% + the number of zspages with usage ratio between 40% and 50% +60% + the number of zspages with usage ratio between 50% and 60% +70% + the number of zspages with usage ratio between 60% and 70% +80% + the number of zspages with usage ratio between 70% and 80% +90% + the number of zspages with usage ratio between 80% and 90% +99% + the number of zspages with usage ratio between 90% and 99% +100% + the number of zspages with usage ratio 100% obj_allocated the number of objects allocated obj_used @@ -66,19 +83,14 @@ pages_used the number of pages allocated for the class pages_per_zspage the number of 0-order pages to make a zspage +freeable + the approximate number of pages class compaction can free -We assign a zspage to ZS_ALMOST_EMPTY fullness group when n <= N / f, where - -* n = number of allocated objects -* N = total number of objects zspage can store -* f = fullness_threshold_frac(ie, 4 at the moment) - -Similarly, we assign zspage to: - -* ZS_ALMOST_FULL when n > N / f -* ZS_EMPTY when n == 0 -* ZS_FULL when n == N - +Each zspage maintains inuse counter which keeps track of the number of +objects stored in the zspage. The inuse counter determines the zspage's +"fullness group" which is calculated as the ratio of the "inuse" objects to +the total number of objects the zspage can hold (objs_per_zspage). The +closer the inuse counter is to objs_per_zspage, the better. Internals ========= @@ -94,10 +106,10 @@ of objects that each zspage can store. For instance, consider the following size classes::: - class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + class size 10% .... 100% obj_allocated obj_used pages_used pages_per_zspage freeable ... - 94 1536 0 0 0 0 0 3 0 - 100 1632 0 0 0 0 0 2 0 + 94 1536 0 .... 
0 0 0 0 3 0 + 100 1632 0 .... 0 0 0 0 2 0 ... @@ -134,10 +146,11 @@ reduces memory wastage. Let's take a closer look at the bottom of `/sys/kernel/debug/zsmalloc/zramX/classes`::: - class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + class size 10% .... 100% obj_allocated obj_used pages_used pages_per_zspage freeable + ... - 202 3264 0 0 0 0 0 4 0 - 254 4096 0 0 0 0 0 1 0 + 202 3264 0 .. 0 0 0 0 4 0 + 254 4096 0 .. 0 0 0 0 1 0 ... Size class #202 stores objects of size 3264 bytes and has a maximum of 4 pages @@ -151,40 +164,42 @@ efficient storage of large objects. For zspage chain size of 8, huge class watermark becomes 3632 bytes::: - class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + class size 10% .... 100% obj_allocated obj_used pages_used pages_per_zspage freeable + ... - 202 3264 0 0 0 0 0 4 0 - 211 3408 0 0 0 0 0 5 0 - 217 3504 0 0 0 0 0 6 0 - 222 3584 0 0 0 0 0 7 0 - 225 3632 0 0 0 0 0 8 0 - 254 4096 0 0 0 0 0 1 0 + 202 3264 0 .. 0 0 0 0 4 0 + 211 3408 0 .. 0 0 0 0 5 0 + 217 3504 0 .. 0 0 0 0 6 0 + 222 3584 0 .. 0 0 0 0 7 0 + 225 3632 0 .. 0 0 0 0 8 0 + 254 4096 0 .. 0 0 0 0 1 0 ... For zspage chain size of 16, huge class watermark becomes 3840 bytes::: - class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + class size 10% .... 100% obj_allocated obj_used pages_used pages_per_zspage freeable + ... - 202 3264 0 0 0 0 0 4 0 - 206 3328 0 0 0 0 0 13 0 - 207 3344 0 0 0 0 0 9 0 - 208 3360 0 0 0 0 0 14 0 - 211 3408 0 0 0 0 0 5 0 - 212 3424 0 0 0 0 0 16 0 - 214 3456 0 0 0 0 0 11 0 - 217 3504 0 0 0 0 0 6 0 - 219 3536 0 0 0 0 0 13 0 - 222 3584 0 0 0 0 0 7 0 - 223 3600 0 0 0 0 0 15 0 - 225 3632 0 0 0 0 0 8 0 - 228 3680 0 0 0 0 0 9 0 - 230 3712 0 0 0 0 0 10 0 - 232 3744 0 0 0 0 0 11 0 - 234 3776 0 0 0 0 0 12 0 - 235 3792 0 0 0 0 0 13 0 - 236 3808 0 0 0 0 0 14 0 - 238 3840 0 0 0 0 0 15 0 - 254 4096 0 0 0 0 0 1 0 + 202 3264 0 .. 0 0 0 0 4 0 + 206 3328 0 .. 0 0 0 0 13 0 + 207 3344 0 .. 0 0 0 0 9 0 + 208 3360 0 .. 0 0 0 0 14 0 + 211 3408 0 .. 0 0 0 0 5 0 + 212 3424 0 .. 0 0 0 0 16 0 + 214 3456 0 .. 0 0 0 0 11 0 + 217 3504 0 .. 0 0 0 0 6 0 + 219 3536 0 .. 0 0 0 0 13 0 + 222 3584 0 .. 0 0 0 0 7 0 + 223 3600 0 .. 0 0 0 0 15 0 + 225 3632 0 .. 0 0 0 0 8 0 + 228 3680 0 .. 0 0 0 0 9 0 + 230 3712 0 .. 0 0 0 0 10 0 + 232 3744 0 .. 0 0 0 0 11 0 + 234 3776 0 .. 0 0 0 0 12 0 + 235 3792 0 .. 0 0 0 0 13 0 + 236 3808 0 .. 0 0 0 0 14 0 + 238 3840 0 .. 0 0 0 0 15 0 + 254 4096 0 .. 0 0 0 0 1 0 ... Overall the combined zspage chain size effect on zsmalloc pool configuration::: @@ -214,9 +229,10 @@ zram as a build artifacts storage (Linux kernel compilation). zsmalloc classes stats::: - class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + class size 10% .... 100% obj_allocated obj_used pages_used pages_per_zspage freeable + ... - Total 13 51 413836 412973 159955 3 + Total 13 .. 51 413836 412973 159955 3 zram mm_stat::: @@ -227,9 +243,10 @@ zram as a build artifacts storage (Linux kernel compilation). zsmalloc classes stats::: - class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + class size 10% .... 100% obj_allocated obj_used pages_used pages_per_zspage freeable + ... - Total 18 87 414852 412978 156666 0 + Total 18 .. 
87 414852 412978 156666 0 zram mm_stat::: @@ -781,6 +781,33 @@ out: return ret; } +static int __dax_clear_dirty_range(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + XA_STATE(xas, &mapping->i_pages, start); + unsigned int scanned = 0; + void *entry; + + xas_lock_irq(&xas); + xas_for_each(&xas, entry, end) { + entry = get_unlocked_entry(&xas, 0); + xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); + xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); + put_unlocked_entry(&xas, entry, WAKE_NEXT); + + if (++scanned % XA_CHECK_SCHED) + continue; + + xas_pause(&xas); + xas_unlock_irq(&xas); + cond_resched(); + xas_lock_irq(&xas); + } + xas_unlock_irq(&xas); + + return 0; +} + /* * Delete DAX entry at @index from @mapping. Wait for it * to be unlocked before deleting it. @@ -1440,6 +1467,16 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, * written by write(2) is visible in mmap. */ if (iomap->flags & IOMAP_F_NEW || cow) { + /* + * Filesystem allows CoW on non-shared extents. The src extents + * may have been mmapped with dirty mark before. To be able to + * invalidate its dax entries, we need to clear the dirty mark + * in advance. + */ + if (cow) + __dax_clear_dirty_range(iomi->inode->i_mapping, + pos >> PAGE_SHIFT, + (end - 1) >> PAGE_SHIFT); invalidate_inode_pages2_range(iomi->inode->i_mapping, pos >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 195dc23e0d83..1db3e3c24b43 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -978,6 +978,16 @@ restart: continue; } + /* + * If wb_tryget fails, the wb has been shutdown, skip it. + * + * Pin @wb so that it stays on @bdi->wb_list. This allows + * continuing iteration from @wb after dropping and + * regrabbing rcu read lock. + */ + if (!wb_tryget(wb)) + continue; + /* alloc failed, execute synchronously using on-stack fallback */ work = &fallback_work; *work = *base_work; @@ -986,13 +996,6 @@ restart: work->done = &fallback_work_done; wb_queue_work(wb, work); - - /* - * Pin @wb so that it stays on @bdi->wb_list. This allows - * continuing iteration from @wb after dropping and - * regrabbing rcu read lock. - */ - wb_get(wb); last_wb = wb; rcu_read_unlock(); diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 2681a449edc1..13592e82eaf6 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -2219,6 +2219,7 @@ static int nilfs_btree_assign_p(struct nilfs_bmap *btree, /* on-disk format */ binfo->bi_dat.bi_blkoff = cpu_to_le64(key); binfo->bi_dat.bi_level = level; + memset(binfo->bi_dat.bi_pad, 0, sizeof(binfo->bi_dat.bi_pad)); return 0; } diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index a35f2795b242..4c85914f2abc 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -314,6 +314,7 @@ static int nilfs_direct_assign_p(struct nilfs_bmap *direct, binfo->bi_dat.bi_blkoff = cpu_to_le64(key); binfo->bi_dat.bi_level = 0; + memset(binfo->bi_dat.bi_pad, 0, sizeof(binfo->bi_dat.bi_pad)); return 0; } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 19446a8243d7..6ad41390fa74 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2609,11 +2609,10 @@ static int nilfs_segctor_thread(void *arg) goto loop; end_thread: - spin_unlock(&sci->sc_state_lock); - /* end sync. 
*/ sci->sc_task = NULL; wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */ + spin_unlock(&sci->sc_state_lock); return 0; } diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 1422b8ba24ed..77f1e5778d1c 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -482,6 +482,7 @@ static void nilfs_put_super(struct super_block *sb) up_write(&nilfs->ns_sem); } + nilfs_sysfs_delete_device_group(nilfs); iput(nilfs->ns_sufile); iput(nilfs->ns_cpfile); iput(nilfs->ns_dat); @@ -1105,6 +1106,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) nilfs_put_root(fsroot); failed_unload: + nilfs_sysfs_delete_device_group(nilfs); iput(nilfs->ns_sufile); iput(nilfs->ns_cpfile); iput(nilfs->ns_dat); diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 3a4c9c150cbf..2894152a6b25 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -87,7 +87,6 @@ void destroy_nilfs(struct the_nilfs *nilfs) { might_sleep(); if (nilfs_init(nilfs)) { - nilfs_sysfs_delete_device_group(nilfs); brelse(nilfs->ns_sbh[0]); brelse(nilfs->ns_sbh[1]); } @@ -305,6 +304,10 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) goto failed; } + err = nilfs_sysfs_create_device_group(sb); + if (unlikely(err)) + goto sysfs_error; + if (valid_fs) goto skip_recovery; @@ -366,6 +369,9 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) goto failed; failed_unload: + nilfs_sysfs_delete_device_group(nilfs); + + sysfs_error: iput(nilfs->ns_cpfile); iput(nilfs->ns_sufile); iput(nilfs->ns_dat); @@ -697,10 +703,6 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) if (err) goto failed_sbh; - err = nilfs_sysfs_create_device_group(sb); - if (err) - goto failed_sbh; - set_nilfs_init(nilfs); err = 0; out: diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 8395605790f6..3b2a41c330e6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1977,8 +1977,10 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = -EFAULT; if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; - /* Ignore unsupported features (userspace built against newer kernel) */ - features = uffdio_api.features & UFFD_API_FEATURES; + features = uffdio_api.features; + ret = -EINVAL; + if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) + goto err_out; ret = -EPERM; if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) goto err_out; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 092f842a854f..3fc9e680f174 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -800,7 +800,8 @@ struct mm_struct { unsigned long cpu_bitmap[]; }; -#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN) +#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \ + MT_FLAGS_USE_RCU) extern struct mm_struct init_mm; /* Pointer magic because the dynamic array size confuses some compilers. 
*/ diff --git a/kernel/fork.c b/kernel/fork.c index 2bde99037652..9051bc07b600 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -683,6 +683,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (retval) goto out; + mt_clear_in_rcu(vmi.mas.tree); for_each_vma(old_vmi, mpnt) { struct file *file; @@ -766,6 +767,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, retval = arch_dup_mmap(oldmm, mm); loop_out: vma_iter_free(&vmi); + if (!retval) + mt_set_in_rcu(vmi.mas.tree); out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); diff --git a/lib/maple_tree.c b/lib/maple_tree.c index ae37a167e25d..5577e6da6fe3 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -185,7 +185,7 @@ static void mt_free_rcu(struct rcu_head *head) */ static void ma_free_rcu(struct maple_node *node) { - node->parent = ma_parent_ptr(node); + WARN_ON(node->parent != ma_parent_ptr(node)); call_rcu(&node->rcu, mt_free_rcu); } @@ -539,11 +539,14 @@ static inline struct maple_node *mte_parent(const struct maple_enode *enode) */ static inline bool ma_dead_node(const struct maple_node *node) { - struct maple_node *parent = (void *)((unsigned long) - node->parent & ~MAPLE_NODE_MASK); + struct maple_node *parent; + /* Do not reorder reads from the node prior to the parent check */ + smp_rmb(); + parent = (void *)((unsigned long) node->parent & ~MAPLE_NODE_MASK); return (parent == node); } + /* * mte_dead_node() - check if the @enode is dead. * @enode: The encoded maple node @@ -555,6 +558,8 @@ static inline bool mte_dead_node(const struct maple_enode *enode) struct maple_node *parent, *node; node = mte_to_node(enode); + /* Do not reorder reads from the node prior to the parent check */ + smp_rmb(); parent = mte_parent(enode); return (parent == node); } @@ -625,6 +630,8 @@ static inline unsigned int mas_alloc_req(const struct ma_state *mas) * @node - the maple node * @type - the node type * + * In the event of a dead node, this array may be %NULL + * * Return: A pointer to the maple node pivots */ static inline unsigned long *ma_pivots(struct maple_node *node, @@ -817,6 +824,11 @@ static inline void *mt_slot(const struct maple_tree *mt, return rcu_dereference_check(slots[offset], mt_locked(mt)); } +static inline void *mt_slot_locked(struct maple_tree *mt, void __rcu **slots, + unsigned char offset) +{ + return rcu_dereference_protected(slots[offset], mt_locked(mt)); +} /* * mas_slot_locked() - Get the slot value when holding the maple tree lock. * @mas: The maple state @@ -828,7 +840,7 @@ static inline void *mt_slot(const struct maple_tree *mt, static inline void *mas_slot_locked(struct ma_state *mas, void __rcu **slots, unsigned char offset) { - return rcu_dereference_protected(slots[offset], mt_locked(mas->tree)); + return mt_slot_locked(mas->tree, slots, offset); } /* @@ -900,6 +912,45 @@ static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt, } /* + * mt_clear_meta() - clear the metadata information of a node, if it exists + * @mt: The maple tree + * @mn: The maple node + * @type: The maple node type + * @offset: The offset of the highest sub-gap in this node. + * @end: The end of the data in this node. 
+ */ +static inline void mt_clear_meta(struct maple_tree *mt, struct maple_node *mn, + enum maple_type type) +{ + struct maple_metadata *meta; + unsigned long *pivots; + void __rcu **slots; + void *next; + + switch (type) { + case maple_range_64: + pivots = mn->mr64.pivot; + if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) { + slots = mn->mr64.slot; + next = mt_slot_locked(mt, slots, + MAPLE_RANGE64_SLOTS - 1); + if (unlikely((mte_to_node(next) && + mte_node_type(next)))) + return; /* no metadata, could be node */ + } + fallthrough; + case maple_arange_64: + meta = ma_meta(mn, type); + break; + default: + return; + } + + meta->gap = 0; + meta->end = 0; +} + +/* * ma_meta_end() - Get the data end of a node from the metadata * @mn: The maple node * @mt: The maple node type @@ -1096,8 +1147,11 @@ static int mas_ascend(struct ma_state *mas) a_type = mas_parent_enum(mas, p_enode); a_node = mte_parent(p_enode); a_slot = mte_parent_slot(p_enode); - pivots = ma_pivots(a_node, a_type); a_enode = mt_mk_node(a_node, a_type); + pivots = ma_pivots(a_node, a_type); + + if (unlikely(ma_dead_node(a_node))) + return 1; if (!set_min && a_slot) { set_min = true; @@ -1249,26 +1303,21 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) node = mas->alloc; node->request_count = 0; while (requested) { - max_req = MAPLE_ALLOC_SLOTS; - if (node->node_count) { - unsigned int offset = node->node_count; - - slots = (void **)&node->slot[offset]; - max_req -= offset; - } else { - slots = (void **)&node->slot; - } - + max_req = MAPLE_ALLOC_SLOTS - node->node_count; + slots = (void **)&node->slot[node->node_count]; max_req = min(requested, max_req); count = mt_alloc_bulk(gfp, max_req, slots); if (!count) goto nomem_bulk; + if (node->node_count == 0) { + node->slot[0]->node_count = 0; + node->slot[0]->request_count = 0; + } + node->node_count += count; allocated += count; node = node->slot[0]; - node->node_count = 0; - node->request_count = 0; requested -= count; } mas->alloc->total = allocated; @@ -1354,12 +1403,16 @@ static inline struct maple_enode *mas_start(struct ma_state *mas) mas->max = ULONG_MAX; mas->depth = 0; +retry: root = mas_root(mas); /* Tree with nodes */ if (likely(xa_is_node(root))) { mas->depth = 1; mas->node = mte_safe_root(root); mas->offset = 0; + if (mte_dead_node(mas->node)) + goto retry; + return NULL; } @@ -1401,6 +1454,9 @@ static inline unsigned char ma_data_end(struct maple_node *node, { unsigned char offset; + if (!pivots) + return 0; + if (type == maple_arange_64) return ma_meta_end(node, type); @@ -1436,6 +1492,9 @@ static inline unsigned char mas_data_end(struct ma_state *mas) return ma_meta_end(node, type); pivots = ma_pivots(node, type); + if (unlikely(ma_dead_node(node))) + return 0; + offset = mt_pivots[type] - 1; if (likely(!pivots[offset])) return ma_meta_end(node, type); @@ -1724,8 +1783,10 @@ static inline void mas_replace(struct ma_state *mas, bool advanced) rcu_assign_pointer(slots[offset], mas->node); } - if (!advanced) + if (!advanced) { + mte_set_node_dead(old_enode); mas_free(mas, old_enode); + } } /* @@ -3659,10 +3720,9 @@ static inline int mas_root_expand(struct ma_state *mas, void *entry) slot++; mas->depth = 1; mas_set_height(mas); - + ma_set_meta(node, maple_leaf_64, 0, slot); /* swap the new root into the tree */ rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); - ma_set_meta(node, maple_leaf_64, 0, slot); return slot; } @@ -3875,18 +3935,13 @@ static inline void *mtree_lookup_walk(struct ma_state *mas) end = ma_data_end(node, type, 
pivots, max); if (unlikely(ma_dead_node(node))) goto dead_node; - - if (pivots[offset] >= mas->index) - goto next; - do { - offset++; - } while ((offset < end) && (pivots[offset] < mas->index)); - - if (likely(offset > end)) - max = pivots[offset]; + if (pivots[offset] >= mas->index) { + max = pivots[offset]; + break; + } + } while (++offset < end); -next: slots = ma_slots(node, type); next = mt_slot(mas->tree, slots, offset); if (unlikely(ma_dead_node(node))) @@ -4164,6 +4219,7 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas) done: mas_leaf_set_meta(mas, newnode, dst_pivots, maple_leaf_64, new_end); if (in_rcu) { + mte_set_node_dead(mas->node); mas->node = mt_mk_node(newnode, wr_mas->type); mas_replace(mas, false); } else { @@ -4505,6 +4561,9 @@ static inline int mas_prev_node(struct ma_state *mas, unsigned long min) node = mas_mn(mas); slots = ma_slots(node, mt); pivots = ma_pivots(node, mt); + if (unlikely(ma_dead_node(node))) + return 1; + mas->max = pivots[offset]; if (offset) mas->min = pivots[offset - 1] + 1; @@ -4526,6 +4585,9 @@ static inline int mas_prev_node(struct ma_state *mas, unsigned long min) slots = ma_slots(node, mt); pivots = ma_pivots(node, mt); offset = ma_data_end(node, mt, pivots, mas->max); + if (unlikely(ma_dead_node(node))) + return 1; + if (offset) mas->min = pivots[offset - 1] + 1; @@ -4574,6 +4636,7 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node, struct maple_enode *enode; int level = 0; unsigned char offset; + unsigned char node_end; enum maple_type mt; void __rcu **slots; @@ -4597,7 +4660,11 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node, node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); - } while (unlikely(offset == ma_data_end(node, mt, pivots, mas->max))); + node_end = ma_data_end(node, mt, pivots, mas->max); + if (unlikely(ma_dead_node(node))) + return 1; + + } while (unlikely(offset == node_end)); slots = ma_slots(node, mt); pivot = mas_safe_pivot(mas, pivots, ++offset, mt); @@ -4613,6 +4680,9 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node, mt = mte_node_type(mas->node); slots = ma_slots(node, mt); pivots = ma_pivots(node, mt); + if (unlikely(ma_dead_node(node))) + return 1; + offset = 0; pivot = pivots[0]; } @@ -4659,11 +4729,14 @@ static inline void *mas_next_nentry(struct ma_state *mas, return NULL; } - pivots = ma_pivots(node, type); slots = ma_slots(node, type); - mas->index = mas_safe_min(mas, pivots, mas->offset); + pivots = ma_pivots(node, type); count = ma_data_end(node, type, pivots, mas->max); - if (ma_dead_node(node)) + if (unlikely(ma_dead_node(node))) + return NULL; + + mas->index = mas_safe_min(mas, pivots, mas->offset); + if (unlikely(ma_dead_node(node))) return NULL; if (mas->index > max) @@ -4817,6 +4890,11 @@ retry: slots = ma_slots(mn, mt); pivots = ma_pivots(mn, mt); + if (unlikely(ma_dead_node(mn))) { + mas_rewalk(mas, index); + goto retry; + } + if (offset == mt_pivots[mt]) pivot = mas->max; else @@ -5400,24 +5478,26 @@ no_gap: } /* - * mas_dead_leaves() - Mark all leaves of a node as dead. + * mte_dead_leaves() - Mark all leaves of a node as dead. * @mas: The maple state * @slots: Pointer to the slot array + * @type: The maple node type * * Must hold the write lock. * * Return: The number of leaves marked as dead. 
*/ static inline -unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots) +unsigned char mte_dead_leaves(struct maple_enode *enode, struct maple_tree *mt, + void __rcu **slots) { struct maple_node *node; enum maple_type type; void *entry; int offset; - for (offset = 0; offset < mt_slot_count(mas->node); offset++) { - entry = mas_slot_locked(mas, slots, offset); + for (offset = 0; offset < mt_slot_count(enode); offset++) { + entry = mt_slot(mt, slots, offset); type = mte_node_type(entry); node = mte_to_node(entry); /* Use both node and type to catch LE & BE metadata */ @@ -5425,7 +5505,6 @@ unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots) break; mte_set_node_dead(entry); - smp_wmb(); /* Needed for RCU */ node->type = type; rcu_assign_pointer(slots[offset], node); } @@ -5433,151 +5512,160 @@ unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots) return offset; } -static void __rcu **mas_dead_walk(struct ma_state *mas, unsigned char offset) +/** + * mte_dead_walk() - Walk down a dead tree to just before the leaves + * @enode: The maple encoded node + * @offset: The starting offset + * + * Note: This can only be used from the RCU callback context. + */ +static void __rcu **mte_dead_walk(struct maple_enode **enode, unsigned char offset) { struct maple_node *node, *next; void __rcu **slots = NULL; - next = mas_mn(mas); + next = mte_to_node(*enode); do { - mas->node = ma_enode_ptr(next); - node = mas_mn(mas); + *enode = ma_enode_ptr(next); + node = mte_to_node(*enode); slots = ma_slots(node, node->type); - next = mas_slot_locked(mas, slots, offset); + next = rcu_dereference_protected(slots[offset], + lock_is_held(&rcu_callback_map)); offset = 0; } while (!ma_is_leaf(next->type)); return slots; } +/** + * mt_free_walk() - Walk & free a tree in the RCU callback context + * @head: The RCU head that's within the node. + * + * Note: This can only be used from the RCU callback context. 
+ */ static void mt_free_walk(struct rcu_head *head) { void __rcu **slots; struct maple_node *node, *start; - struct maple_tree mt; + struct maple_enode *enode; unsigned char offset; enum maple_type type; - MA_STATE(mas, &mt, 0, 0); node = container_of(head, struct maple_node, rcu); if (ma_is_leaf(node->type)) goto free_leaf; - mt_init_flags(&mt, node->ma_flags); - mas_lock(&mas); start = node; - mas.node = mt_mk_node(node, node->type); - slots = mas_dead_walk(&mas, 0); - node = mas_mn(&mas); + enode = mt_mk_node(node, node->type); + slots = mte_dead_walk(&enode, 0); + node = mte_to_node(enode); do { mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; - mas.node = node->piv_parent; - if (mas_mn(&mas) == node) - goto start_slots_free; - - type = mte_node_type(mas.node); - slots = ma_slots(mte_to_node(mas.node), type); - if ((offset < mt_slots[type]) && (slots[offset])) - slots = mas_dead_walk(&mas, offset); - - node = mas_mn(&mas); + enode = node->piv_parent; + if (mte_to_node(enode) == node) + goto free_leaf; + + type = mte_node_type(enode); + slots = ma_slots(mte_to_node(enode), type); + if ((offset < mt_slots[type]) && + rcu_dereference_protected(slots[offset], + lock_is_held(&rcu_callback_map))) + slots = mte_dead_walk(&enode, offset); + node = mte_to_node(enode); } while ((node != start) || (node->slot_len < offset)); slots = ma_slots(node, node->type); mt_free_bulk(node->slot_len, slots); -start_slots_free: - mas_unlock(&mas); free_leaf: mt_free_rcu(&node->rcu); } -static inline void __rcu **mas_destroy_descend(struct ma_state *mas, - struct maple_enode *prev, unsigned char offset) +static inline void __rcu **mte_destroy_descend(struct maple_enode **enode, + struct maple_tree *mt, struct maple_enode *prev, unsigned char offset) { struct maple_node *node; - struct maple_enode *next = mas->node; + struct maple_enode *next = *enode; void __rcu **slots = NULL; + enum maple_type type; + unsigned char next_offset = 0; do { - mas->node = next; - node = mas_mn(mas); - slots = ma_slots(node, mte_node_type(mas->node)); - next = mas_slot_locked(mas, slots, 0); + *enode = next; + node = mte_to_node(*enode); + type = mte_node_type(*enode); + slots = ma_slots(node, type); + next = mt_slot_locked(mt, slots, next_offset); if ((mte_dead_node(next))) - next = mas_slot_locked(mas, slots, 1); + next = mt_slot_locked(mt, slots, ++next_offset); - mte_set_node_dead(mas->node); - node->type = mte_node_type(mas->node); + mte_set_node_dead(*enode); + node->type = type; node->piv_parent = prev; node->parent_slot = offset; - offset = 0; - prev = mas->node; + offset = next_offset; + next_offset = 0; + prev = *enode; } while (!mte_is_leaf(next)); return slots; } -static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags, +static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, bool free) { void __rcu **slots; struct maple_node *node = mte_to_node(enode); struct maple_enode *start; - struct maple_tree mt; - - MA_STATE(mas, &mt, 0, 0); - if (mte_is_leaf(enode)) + if (mte_is_leaf(enode)) { + node->type = mte_node_type(enode); goto free_leaf; + } - mt_init_flags(&mt, ma_flags); - mas_lock(&mas); - - mas.node = start = enode; - slots = mas_destroy_descend(&mas, start, 0); - node = mas_mn(&mas); + start = enode; + slots = mte_destroy_descend(&enode, mt, start, 0); + node = mte_to_node(enode); // Updated in the above call. 
do { enum maple_type type; unsigned char offset; struct maple_enode *parent, *tmp; - node->slot_len = mas_dead_leaves(&mas, slots); + node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; - mas.node = node->piv_parent; - if (mas_mn(&mas) == node) - goto start_slots_free; + enode = node->piv_parent; + if (mte_to_node(enode) == node) + goto free_leaf; - type = mte_node_type(mas.node); - slots = ma_slots(mte_to_node(mas.node), type); + type = mte_node_type(enode); + slots = ma_slots(mte_to_node(enode), type); if (offset >= mt_slots[type]) goto next; - tmp = mas_slot_locked(&mas, slots, offset); + tmp = mt_slot_locked(mt, slots, offset); if (mte_node_type(tmp) && mte_to_node(tmp)) { - parent = mas.node; - mas.node = tmp; - slots = mas_destroy_descend(&mas, parent, offset); + parent = enode; + enode = tmp; + slots = mte_destroy_descend(&enode, mt, parent, offset); } next: - node = mas_mn(&mas); - } while (start != mas.node); + node = mte_to_node(enode); + } while (start != enode); - node = mas_mn(&mas); - node->slot_len = mas_dead_leaves(&mas, slots); + node = mte_to_node(enode); + node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); -start_slots_free: - mas_unlock(&mas); - free_leaf: if (free) mt_free_rcu(&node->rcu); + else + mt_clear_meta(mt, node, node->type); } /* @@ -5593,10 +5681,10 @@ static inline void mte_destroy_walk(struct maple_enode *enode, struct maple_node *node = mte_to_node(enode); if (mt_in_rcu(mt)) { - mt_destroy_walk(enode, mt->ma_flags, false); + mt_destroy_walk(enode, mt, false); call_rcu(&node->rcu, mt_free_walk); } else { - mt_destroy_walk(enode, mt->ma_flags, true); + mt_destroy_walk(enode, mt, true); } } @@ -6618,11 +6706,11 @@ static inline void *mas_first_entry(struct ma_state *mas, struct maple_node *mn, while (likely(!ma_is_leaf(mt))) { MT_BUG_ON(mas->tree, mte_dead_node(mas->node)); slots = ma_slots(mn, mt); - pivots = ma_pivots(mn, mt); - max = pivots[0]; entry = mas_slot(mas, slots, 0); + pivots = ma_pivots(mn, mt); if (unlikely(ma_dead_node(mn))) return NULL; + max = pivots[0]; mas->node = entry; mn = mas_mn(mas); mt = mte_node_type(mas->node); @@ -6642,13 +6730,13 @@ static inline void *mas_first_entry(struct ma_state *mas, struct maple_node *mn, if (likely(entry)) return entry; - pivots = ma_pivots(mn, mt); - mas->index = pivots[0] + 1; mas->offset = 1; entry = mas_slot(mas, slots, 1); + pivots = ma_pivots(mn, mt); if (unlikely(ma_dead_node(mn))) return NULL; + mas->index = pivots[0] + 1; if (mas->index > limit) goto none; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index a53b9360b72e..30d2d0386fdb 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -507,6 +507,15 @@ static LIST_HEAD(offline_cgwbs); static void cleanup_offline_cgwbs_workfn(struct work_struct *work); static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn); +static void cgwb_free_rcu(struct rcu_head *rcu_head) +{ + struct bdi_writeback *wb = container_of(rcu_head, + struct bdi_writeback, rcu); + + percpu_ref_exit(&wb->refcnt); + kfree(wb); +} + static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, @@ -529,11 +538,10 @@ static void cgwb_release_workfn(struct work_struct *work) list_del(&wb->offline_node); spin_unlock_irq(&cgwb_lock); - percpu_ref_exit(&wb->refcnt); wb_exit(wb); bdi_put(bdi); WARN_ON_ONCE(!list_empty(&wb->b_attached)); - kfree_rcu(wb, rcu); + 
call_rcu(&wb->rcu, cgwb_free_rcu); } static void cgwb_release(struct percpu_ref *refcnt) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 81a5689806af..03d78901a7a7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1830,10 +1830,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (is_swap_pmd(*pmd)) { swp_entry_t entry = pmd_to_swp_entry(*pmd); struct page *page = pfn_swap_entry_to_page(entry); + pmd_t newpmd; VM_BUG_ON(!is_pmd_migration_entry(*pmd)); if (is_writable_migration_entry(entry)) { - pmd_t newpmd; /* * A protection check is difficult so * just be safe and disable write @@ -1847,8 +1847,16 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, newpmd = pmd_swp_mksoft_dirty(newpmd); if (pmd_swp_uffd_wp(*pmd)) newpmd = pmd_swp_mkuffd_wp(newpmd); - set_pmd_at(mm, addr, pmd, newpmd); + } else { + newpmd = *pmd; } + + if (uffd_wp) + newpmd = pmd_swp_mkuffd_wp(newpmd); + else if (uffd_wp_resolve) + newpmd = pmd_swp_clear_uffd_wp(newpmd); + if (!pmd_same(*pmd, newpmd)) + set_pmd_at(mm, addr, pmd, newpmd); goto unlock; } #endif @@ -2649,9 +2657,10 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); is_hzp = is_huge_zero_page(&folio->page); - VM_WARN_ON_ONCE_FOLIO(is_hzp, folio); - if (is_hzp) + if (is_hzp) { + pr_warn_ratelimited("Called split_huge_page for huge zero page\n"); return -EBUSY; + } if (folio_test_writeback(folio)) return -EBUSY; @@ -3242,6 +3251,8 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, pmdswp = swp_entry_to_pmd(entry); if (pmd_soft_dirty(pmdval)) pmdswp = pmd_swp_mksoft_dirty(pmdswp); + if (pmd_uffd_wp(pmdval)) + pmdswp = pmd_swp_mkuffd_wp(pmdswp); set_pmd_at(mm, address, pvmw->pmd, pmdswp); page_remove_rmap(page, vma, true); put_page(page); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8bfd07f4c143..a58b3739ed4b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5478,7 +5478,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, struct folio *pagecache_folio, spinlock_t *ptl) { const bool unshare = flags & FAULT_FLAG_UNSHARE; - pte_t pte; + pte_t pte = huge_ptep_get(ptep); struct hstate *h = hstate_vma(vma); struct page *old_page; struct folio *new_folio; @@ -5488,6 +5488,17 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, struct mmu_notifier_range range; /* + * Never handle CoW for uffd-wp protected pages. It should be only + * handled when the uffd-wp protection is removed. + * + * Note that only the CoW optimization path (in hugetlb_no_page()) + * can trigger this, because hugetlb_fault() will always resolve + * uffd-wp bit first. + */ + if (!unshare && huge_pte_uffd_wp(pte)) + return 0; + + /* * hugetlb does not support FOLL_FORCE-style write faults that keep the * PTE mapped R/O such as maybe_mkwrite() would do. 
*/ @@ -5500,7 +5511,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } - pte = huge_ptep_get(ptep); old_page = pte_page(pte); delayacct_wpcopy_start(); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index bee7fd7db380..2c6548cd18a9 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -572,6 +572,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_PTE_NON_PRESENT; goto out; } + if (pte_uffd_wp(pteval)) { + result = SCAN_PTE_UFFD_WP; + goto out; + } page = vm_normal_page(vma, address, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; diff --git a/mm/memory.c b/mm/memory.c index f77fccb5310c..387226d6094d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3569,8 +3569,21 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct mmu_notifier_range range; - if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) + /* + * We need a reference to lock the folio because we don't hold + * the PTL so a racing thread can remove the device-exclusive + * entry and unmap it. If the folio is free the entry must + * have been removed already. If it happens to have already + * been re-allocated after being freed all we do is lock and + * unlock it. + */ + if (!folio_try_get(folio)) + return 0; + + if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) { + folio_put(folio); return VM_FAULT_RETRY; + } mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma->vm_mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL); @@ -3583,6 +3596,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); folio_unlock(folio); + folio_put(folio); mmu_notifier_invalidate_range_end(&range); return 0; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a256a241fd1d..2068b594dc88 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -790,61 +790,50 @@ static int vma_replace_policy(struct vm_area_struct *vma, return err; } -/* Step 2: apply policy to a range and do splits. 
*/ -static int mbind_range(struct mm_struct *mm, unsigned long start, - unsigned long end, struct mempolicy *new_pol) +/* Split or merge the VMA (if required) and apply the new policy */ +static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, struct mempolicy *new_pol) { - VMA_ITERATOR(vmi, mm, start); - struct vm_area_struct *prev; - struct vm_area_struct *vma; - int err = 0; + struct vm_area_struct *merged; + unsigned long vmstart, vmend; pgoff_t pgoff; + int err; - prev = vma_prev(&vmi); - vma = vma_find(&vmi, end); - if (WARN_ON(!vma)) + vmend = min(end, vma->vm_end); + if (start > vma->vm_start) { + *prev = vma; + vmstart = start; + } else { + vmstart = vma->vm_start; + } + + if (mpol_equal(vma_policy(vma), new_pol)) return 0; - if (start > vma->vm_start) - prev = vma; - - do { - unsigned long vmstart = max(start, vma->vm_start); - unsigned long vmend = min(end, vma->vm_end); - - if (mpol_equal(vma_policy(vma), new_pol)) - goto next; - - pgoff = vma->vm_pgoff + - ((vmstart - vma->vm_start) >> PAGE_SHIFT); - prev = vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, - new_pol, vma->vm_userfaultfd_ctx, - anon_vma_name(vma)); - if (prev) { - vma = prev; - goto replace; - } - if (vma->vm_start != vmstart) { - err = split_vma(&vmi, vma, vmstart, 1); - if (err) - goto out; - } - if (vma->vm_end != vmend) { - err = split_vma(&vmi, vma, vmend, 0); - if (err) - goto out; - } -replace: - err = vma_replace_policy(vma, new_pol); + pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); + merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags, + vma->anon_vma, vma->vm_file, pgoff, new_pol, + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + if (merged) { + *prev = merged; + return vma_replace_policy(merged, new_pol); + } + + if (vma->vm_start != vmstart) { + err = split_vma(vmi, vma, vmstart, 1); if (err) - goto out; -next: - prev = vma; - } for_each_vma_range(vmi, vma, end); + return err; + } -out: - return err; + if (vma->vm_end != vmend) { + err = split_vma(vmi, vma, vmend, 0); + if (err) + return err; + } + + *prev = vma; + return vma_replace_policy(vma, new_pol); } /* Set the process memory policy */ @@ -1259,6 +1248,8 @@ static long do_mbind(unsigned long start, unsigned long len, nodemask_t *nmask, unsigned long flags) { struct mm_struct *mm = current->mm; + struct vm_area_struct *vma, *prev; + struct vma_iterator vmi; struct mempolicy *new; unsigned long end; int err; @@ -1328,7 +1319,13 @@ static long do_mbind(unsigned long start, unsigned long len, goto up_out; } - err = mbind_range(mm, start, end, new); + vma_iter_init(&vmi, mm, start); + prev = vma_prev(&vmi); + for_each_vma_range(vmi, vma, end) { + err = mbind_range(&vmi, vma, &prev, start, end, new); + if (err) + break; + } if (!err) { int nr_failed = 0; @@ -1489,10 +1486,8 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le unsigned long, home_node, unsigned long, flags) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; + struct vm_area_struct *vma, *prev; struct mempolicy *new, *old; - unsigned long vmstart; - unsigned long vmend; unsigned long end; int err = -ENOENT; VMA_ITERATOR(vmi, mm, start); @@ -1521,6 +1516,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le if (end == start) return 0; mmap_write_lock(mm); + prev = vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { /* * If any 
vma in the range got policy other than MPOL_BIND @@ -1541,9 +1537,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le } new->home_node = home_node; - vmstart = max(start, vma->vm_start); - vmend = min(end, vma->vm_end); - err = mbind_range(mm, vmstart, vmend, new); + err = mbind_range(&vmi, vma, &prev, start, end, new); mpol_put(new); if (err) break; diff --git a/mm/mmap.c b/mm/mmap.c index 511f656eb423..51cd747884e3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2309,7 +2309,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, int count = 0; int error = -ENOMEM; MA_STATE(mas_detach, &mt_detach, 0, 0); - mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN); + mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); mt_set_external_lock(&mt_detach, &mm->mmap_lock); /* @@ -3069,6 +3069,7 @@ void exit_mmap(struct mm_struct *mm) */ set_bit(MMF_OOM_SKIP, &mm->flags); mmap_write_lock(mm); + mt_clear_in_rcu(&mm->mm_mt); free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING, true); tlb_finish_mmu(&tlb); diff --git a/mm/mprotect.c b/mm/mprotect.c index b9da9a5f87fe..204194155863 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -869,7 +869,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, } tlb_finish_mmu(&tlb); - if (vma_iter_end(&vmi) < end) + if (!error && vma_iter_end(&vmi) < end) error = -ENOMEM; out: diff --git a/mm/swap.c b/mm/swap.c index 57cb01b042f6..423199ee8478 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -222,7 +222,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); folios_put(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_init(fbatch); + folio_batch_reinit(fbatch); } static void folio_batch_add_and_move(struct folio_batch *fbatch, diff --git a/mm/swapfile.c b/mm/swapfile.c index 00b3e46becad..274bbf797480 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -679,6 +679,7 @@ static void __del_from_avail_list(struct swap_info_struct *p) { int nid; + assert_spin_locked(&p->lock); for_each_node(nid) plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]); } @@ -2434,8 +2435,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } - del_from_avail_list(p); spin_lock(&p->lock); + del_from_avail_list(p); if (p->prio < 0) { struct swap_info_struct *si = p; int nid; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5291c6f02cf7..3fa476f17887 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3042,9 +3042,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, * allocation request, free them via vfree() if any. 
*/ if (area->nr_pages != nr_small_pages) { - warn_alloc(gfp_mask, NULL, - "vmalloc error: size %lu, page order %u, failed to allocate pages", - area->nr_pages * PAGE_SIZE, page_order); + /* vm_area_alloc_pages() can also fail due to a fatal signal */ + if (!fatal_signal_pending(current)) + warn_alloc(gfp_mask, NULL, + "vmalloc error: size %lu, page order %u, failed to allocate pages", + area->nr_pages * PAGE_SIZE, page_order); goto fail; } diff --git a/tools/mm/page_owner_sort.c b/tools/mm/page_owner_sort.c index 7c2ac124cdc8..99798894b879 100644 --- a/tools/mm/page_owner_sort.c +++ b/tools/mm/page_owner_sort.c @@ -857,7 +857,7 @@ int main(int argc, char **argv) if (cull & CULL_PID || filter & FILTER_PID) fprintf(fout, ", PID %d", list[i].pid); if (cull & CULL_TGID || filter & FILTER_TGID) - fprintf(fout, ", TGID %d", list[i].pid); + fprintf(fout, ", TGID %d", list[i].tgid); if (cull & CULL_COMM || filter & FILTER_COMM) fprintf(fout, ", task_comm_name: %s", list[i].comm); if (cull & CULL_ALLOCATOR) { diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 958ee9bdb316..4c89ff333f6f 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -108,6 +108,7 @@ static noinline void check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mn->slot[1] != NULL); MT_BUG_ON(mt, mas_allocated(&mas) != 0); + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); mas.node = MAS_START; mas_nomem(&mas, GFP_KERNEL); @@ -160,6 +161,7 @@ static noinline void check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas_allocated(&mas) != i); MT_BUG_ON(mt, !mn); MT_BUG_ON(mt, not_empty(mn)); + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); } @@ -192,6 +194,7 @@ static noinline void check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, not_empty(mn)); MT_BUG_ON(mt, mas_allocated(&mas) != i - 1); MT_BUG_ON(mt, !mn); + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); } @@ -210,6 +213,7 @@ static noinline void check_new_node(struct maple_tree *mt) mn = mas_pop_node(&mas); MT_BUG_ON(mt, not_empty(mn)); MT_BUG_ON(mt, mas_allocated(&mas) != j - 1); + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); } MT_BUG_ON(mt, mas_allocated(&mas) != 0); @@ -233,6 +237,7 @@ static noinline void check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas_allocated(&mas) != i - j); mn = mas_pop_node(&mas); MT_BUG_ON(mt, not_empty(mn)); + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); MT_BUG_ON(mt, mas_allocated(&mas) != i - j - 1); } @@ -269,6 +274,7 @@ static noinline void check_new_node(struct maple_tree *mt) mn = mas_pop_node(&mas); /* get the next node. */ MT_BUG_ON(mt, mn == NULL); MT_BUG_ON(mt, not_empty(mn)); + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); } MT_BUG_ON(mt, mas_allocated(&mas) != 0); @@ -294,6 +300,7 @@ static noinline void check_new_node(struct maple_tree *mt) mn = mas_pop_node(&mas2); /* get the next node. 
*/ MT_BUG_ON(mt, mn == NULL); MT_BUG_ON(mt, not_empty(mn)); + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); } MT_BUG_ON(mt, mas_allocated(&mas2) != 0); @@ -334,10 +341,12 @@ static noinline void check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); mn = mas_pop_node(&mas); MT_BUG_ON(mt, not_empty(mn)); + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); for (i = 1; i <= MAPLE_ALLOC_SLOTS + 1; i++) { mn = mas_pop_node(&mas); MT_BUG_ON(mt, not_empty(mn)); + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); } MT_BUG_ON(mt, mas_allocated(&mas) != 0); @@ -375,6 +384,7 @@ static noinline void check_new_node(struct maple_tree *mt) mas_node_count(&mas, i); /* Request */ mas_nomem(&mas, GFP_KERNEL); /* Fill request */ mn = mas_pop_node(&mas); /* get the next node. */ + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); mas_destroy(&mas); @@ -382,10 +392,13 @@ static noinline void check_new_node(struct maple_tree *mt) mas_node_count(&mas, i); /* Request */ mas_nomem(&mas, GFP_KERNEL); /* Fill request */ mn = mas_pop_node(&mas); /* get the next node. */ + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); mn = mas_pop_node(&mas); /* get the next node. */ + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); mn = mas_pop_node(&mas); /* get the next node. */ + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); mas_destroy(&mas); } @@ -35369,6 +35382,7 @@ static noinline void check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, allocated != 1 + height * 3); mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); mas_destroy(&mas); @@ -35386,6 +35400,7 @@ static noinline void check_prealloc(struct maple_tree *mt) mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); @@ -35756,6 +35771,7 @@ void farmer_tests(void) tree.ma_root = mt_mk_node(node, maple_leaf_64); mt_dump(&tree); + node->parent = ma_parent_ptr(node); ma_free_rcu(node); /* Check things that will make lockdep angry */ |
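The reworked zsmalloc documentation above replaces the old almost_full/almost_empty columns with usage-ratio buckets: a zspage's fullness group is the ratio of its inuse counter to objs_per_zspage, reported under the 10% .. 100% headings. The following is only an illustrative sketch of that bucketing as the documentation describes it, not the kernel's actual implementation; the helper name zspage_usage_bucket, the example object counts, and the handling of bucket boundaries (e.g. where a 99.5% ratio lands) are assumptions made for illustration.

    /*
     * Illustrative sketch only -- not kernel code.  Maps a zspage's usage
     * ratio (inuse / objs_per_zspage) onto the 10% .. 100% columns shown in
     * /sys/kernel/debug/zsmalloc/zramX/classes.
     */
    #include <stdio.h>

    /* Return the percent column (upper bound of the bucket) for a zspage. */
    static int zspage_usage_bucket(int inuse, int objs_per_zspage)
    {
    	int ratio;

    	if (inuse == objs_per_zspage)
    		return 100;			/* completely full zspage */

    	ratio = inuse * 100 / objs_per_zspage;	/* truncating integer ratio */
    	if (ratio >= 90)
    		return 99;			/* "between 90% and 99%" bucket */
    	return (ratio / 10) * 10 + 10;		/* 10%, 20%, ..., 90% buckets */
    }

    int main(void)
    {
    	/* hypothetical class: 512-byte objects, 1 page per zspage -> 8 objects */
    	printf("%d%%\n", zspage_usage_bucket(1, 8));	/* 12.5%  -> 20% column  */
    	printf("%d%%\n", zspage_usage_bucket(7, 8));	/* 87.5%  -> 90% column  */
    	printf("%d%%\n", zspage_usage_bucket(8, 8));	/* full   -> 100% column */
    	return 0;
    }

The closer a zspage's inuse counter is to objs_per_zspage, the further right it lands in the table, which is why densely packed pools show most zspages in the 100% column.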