From 9ec1e972c3de3106140c18d2a1c7c74795d85a69 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:16 -0500 Subject: maple_tree: introduce maple_copy node and use it in mas_spanning_rebalance() Introduce an internal-memory only node type called maple_copy to facilitate internal copy operations. Use it in mas_spanning_rebalance() for just the leaf nodes. Initially, the maple_copy node is used to configure the source nodes and copy the data into the big_node. The maple_copy contains a list of source entries with start and end offsets. One of the maple_copy entries can be itself with an offset of 0 to 2, representing the data where the store partially overwrites entries, or fully overwrites the entry. The side effect is that the source nodes no longer have to worry about partially copying the existing offset if it is not fully overwritten. This is in preparation of removal of the maple big_node, but for the time being the data is copied to the big node to limit the change size. Link: https://lkml.kernel.org/r/20260130205935.2559335-12-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 7b8aad47121e..9bc7fa89bc2e 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -139,6 +139,7 @@ enum maple_type { maple_leaf_64, maple_range_64, maple_arange_64, + maple_copy, }; enum store_type { @@ -154,6 +155,30 @@ enum store_type { wr_slot_store, }; +struct maple_copy { + struct { + struct maple_node *node; + unsigned long max; + unsigned char start; + unsigned char end; + enum maple_type mt; + } src[4]; + /* Simulated node */ + void __rcu *slot[3]; + unsigned long min; + union { + unsigned long pivot[3]; + struct { + void *_pad[2]; + unsigned long max; + }; + }; + unsigned char end; + + /*Avoid passing these around */ + unsigned char s_count; +}; + /** * DOC: Maple tree flags * @@ -299,6 +324,7 @@ struct maple_node { }; struct maple_range_64 mr64; struct maple_arange_64 ma64; + struct maple_copy cp; }; }; -- cgit v1.2.3 From 6953038cab845f3720ec8d83915f4f083861e195 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:19 -0500 Subject: maple_tree: change initial big node setup in mas_wr_spanning_rebalance() Instead of copying the data into the big node and finding out that the data may need to be moved or appended to, calculate the data space up front (in the maple copy node) and set up another source for the copy. The additional copy source is tracked in the maple state sib (short for sibling), and is put into the maple write states for future operations after the data is in the big node. 
To facilitate the newly moved node, some initial setup of the maple subtree state is relocated after the potential shift caused by the new way of rebalancing against a sibling. Link: https://lkml.kernel.org/r/20260130205935.2559335-15-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 1 + lib/maple_tree.c | 175 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 153 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 9bc7fa89bc2e..e99e16ac1c6d 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -177,6 +177,7 @@ struct maple_copy { /*Avoid passing these around */ unsigned char s_count; + unsigned char data; }; /** diff --git a/lib/maple_tree.c b/lib/maple_tree.c index a9b7e398c7db..0d6f810a4a1f 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1304,6 +1304,18 @@ static inline unsigned char mas_data_end(struct ma_state *mas) return mt_pivots[type]; } +static inline +void wr_mas_setup(struct ma_wr_state *wr_mas, struct ma_state *mas) +{ + wr_mas->node = mas_mn(mas); + wr_mas->type = mte_node_type(mas->node); + wr_mas->pivots = ma_pivots(wr_mas->node, wr_mas->type); + wr_mas->slots = ma_slots(wr_mas->node, wr_mas->type); + wr_mas->r_min = mas_safe_min(mas, wr_mas->pivots, mas->offset); + wr_mas->r_max = mas_safe_pivot(mas, wr_mas->pivots, mas->offset, + wr_mas->type); +} + /* * mas_leaf_max_gap() - Returns the largest gap in a leaf node * @mas: the maple state @@ -2258,6 +2270,44 @@ static inline void mte_mid_split_check(struct maple_enode **l, *split = mid_split; } +static inline +void spanning_sib(struct ma_wr_state *l_wr_mas, + struct ma_wr_state *r_wr_mas, 
struct ma_state *nneighbour) +{ + struct ma_state l_tmp = *l_wr_mas->mas; + struct ma_state r_tmp = *r_wr_mas->mas; + unsigned char depth = 0; + + do { + mas_ascend(&r_tmp); + mas_ascend(&l_tmp); + depth++; + if (r_tmp.offset < mas_data_end(&r_tmp)) { + r_tmp.offset++; + mas_descend(&r_tmp); + r_tmp.offset = 0; + while (--depth) + mas_descend(&r_tmp); + + r_tmp.end = mas_data_end(&r_tmp); + *nneighbour = r_tmp; + return; + } else if (l_tmp.offset) { + l_tmp.offset--; + do { + mas_descend(&l_tmp); + l_tmp.offset = mas_data_end(&l_tmp); + } while (--depth); + + l_tmp.end = l_tmp.offset; + *nneighbour = l_tmp; + return; + } + } while (!mte_is_root(r_tmp.node)); + + WARN_ON_ONCE(1); +} + /* * mast_set_split_parents() - Helper function to set three nodes parents. Slot * is taken from @mast->l. @@ -2642,6 +2692,49 @@ static inline void cp_leaf_init(struct maple_copy *cp, cp->end = end; } +/* + * cp_data_calc() - Calculate the size of the data (1 indexed). + * @cp: The maple copy struct with the new data populated. + * @l_wr_mas: The maple write state containing the data to the left of the write + * @r_wr_mas: The maple write state containing the data to the right of the + * write + * + * cp->data is a size (not indexed by 0). 
+ */ +static inline void cp_data_calc(struct maple_copy *cp, + struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas) +{ + + /* Add 1 every time for the 0th element */ + cp->data = l_wr_mas->mas->offset; + /* Add the new data and any partial overwrites */ + cp->data += cp->end + 1; + /* Data from right (offset + 1 to end), +1 for zero */ + cp->data += r_wr_mas->mas->end - r_wr_mas->offset_end; +} + +static inline void append_mas_cp(struct maple_copy *cp, + struct ma_state *mas, unsigned char start, unsigned char end) +{ + struct maple_node *node; + enum maple_type mt; + unsigned char count; + + count = cp->s_count; + node = mas_mn(mas); + mt = mte_node_type(mas->node); + cp->src[count].node = node; + cp->src[count].mt = mt; + if (mas->end <= end) + cp->src[count].max = mas->max; + else + cp->src[count].max = ma_pivots(node, mt)[end]; + + cp->src[count].start = start; + cp->src[count].end = end; + cp->s_count++; +} + static inline void append_wr_mas_cp(struct maple_copy *cp, struct ma_wr_state *wr_mas, unsigned char start, unsigned char end) { @@ -2670,6 +2763,42 @@ static inline void init_cp_src(struct maple_copy *cp) cp->s_count++; } +/* + * multi_src_setup() - Set the @cp node up with multiple sources to copy from. + * @cp: The maple copy node + * @l_wr_mas: The left write maple state + * @r_wr_mas: The right write maple state + * @sib: The sibling maple state + * + * Note: @sib->end == 0 indicates no sibling will be used. 
+ */ +static inline +void multi_src_setup(struct maple_copy *cp, struct ma_wr_state *l_wr_mas, + struct ma_wr_state *r_wr_mas, struct ma_state *sib) +{ + cp->s_count = 0; + if (sib->end && sib->max < l_wr_mas->mas->min) + append_mas_cp(cp, sib, 0, sib->end); + + /* Copy left 0 - offset */ + if (l_wr_mas->mas->offset) { + unsigned char off = l_wr_mas->mas->offset - 1; + + append_wr_mas_cp(cp, l_wr_mas, 0, off); + cp->src[cp->s_count - 1].max = cp->min - 1; + } + + init_cp_src(cp); + + /* Copy right either from offset or offset + 1 pending on r_max */ + if (r_wr_mas->mas->end != r_wr_mas->offset_end) + append_wr_mas_cp(cp, r_wr_mas, r_wr_mas->offset_end + 1, + r_wr_mas->mas->end); + + if (sib->end && sib->min > r_wr_mas->mas->max) + append_mas_cp(cp, sib, 0, sib->end); +} + static inline void cp_data_write(struct maple_copy *cp, struct maple_big_node *b_node) { @@ -2873,36 +3002,42 @@ static noinline void mas_wr_spanning_rebalance(struct ma_state *mas, struct maple_big_node b_node; struct maple_copy cp; unsigned char height; + struct ma_state sib; MA_STATE(l_mas, mas->tree, mas->index, mas->index); MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(m_mas, mas->tree, mas->index, mas->index); MA_STATE(mast_l_mas, NULL, 0, 0); - mast_l_mas = *mas; - mast.orig_l = &mast_l_mas; - mast.orig_r = r_wr_mas->mas; memset(&b_node, 0, sizeof(struct maple_big_node)); + mast_l_mas = *mas; cp.s_count = 0; cp_leaf_init(&cp, mas, l_wr_mas, r_wr_mas); - /* Copy left 0 - offset */ - if (l_wr_mas->mas->offset) { - unsigned char off = l_wr_mas->mas->offset - 1; - - append_wr_mas_cp(&cp, l_wr_mas, 0, off); - cp.src[cp.s_count - 1].max = cp.min - 1; + cp_data_calc(&cp, l_wr_mas, r_wr_mas); + if (((l_wr_mas->mas->min != 0) || (r_wr_mas->mas->max != ULONG_MAX)) && + (cp.data <= mt_min_slots[l_wr_mas->type])) { + spanning_sib(l_wr_mas, r_wr_mas, &sib); + cp.data += sib.end + 1; + } else { + sib.end = 0; } - init_cp_src(&cp); - - /* Copy right from offset_end + 1 to end */ - if 
(r_wr_mas->mas->end != r_wr_mas->offset_end) - append_wr_mas_cp(&cp, r_wr_mas, r_wr_mas->offset_end + 1, - r_wr_mas->mas->end); - - + multi_src_setup(&cp, l_wr_mas, r_wr_mas, &sib); b_node.type = l_wr_mas->type; cp_data_write(&cp, &b_node); + if (sib.end) { + if (sib.max < l_wr_mas->mas->min) { + *l_wr_mas->mas = sib; + wr_mas_setup(l_wr_mas, &sib); + mast_l_mas = sib; + } else { + *r_wr_mas->mas = sib; + wr_mas_setup(r_wr_mas, &sib); + } + } + + mast.orig_l = &mast_l_mas; + mast.orig_r = r_wr_mas->mas; /* Stop spanning searches by searching for just index. */ mast.orig_l->last = mas->index; @@ -2917,12 +3052,6 @@ static noinline void mas_wr_spanning_rebalance(struct ma_state *mas, mast.m = &m_mas; mast.r = &r_mas; l_mas.status = r_mas.status = m_mas.status = ma_none; - - /* Check if this is not root and has sufficient data. */ - if (((mast.orig_l->min != 0) || (mast.orig_r->max != ULONG_MAX)) && - unlikely(mast.bn->b_end <= mt_min_slots[mast.bn->type])) - mast_spanning_rebalance(&mast); - height = mas_mt_height(mas) + 1; /* -- cgit v1.2.3 From 20b20162e1f3b7e60cf0e79116fb2f3bdef3dc5e Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:21 -0500 Subject: maple_tree: add gap support, slot and pivot sizes for maple copy Add plumbing work for using maple copy as a normal node for a source of copy operations. This is needed later. Link: https://lkml.kernel.org/r/20260130205935.2559335-17-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 1 + lib/maple_tree.c | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index e99e16ac1c6d..db6a02788902 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -165,6 +165,7 @@ struct maple_copy { } src[4]; /* Simulated node */ void __rcu *slot[3]; + unsigned long gap[3]; unsigned long min; union { unsigned long pivot[3]; diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 499cae720251..9c701ee7412c 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -101,6 +101,7 @@ static const unsigned long mt_max[] = { [maple_leaf_64] = ULONG_MAX, [maple_range_64] = ULONG_MAX, [maple_arange_64] = ULONG_MAX, + [maple_copy] = ULONG_MAX, }; #define mt_node_max(x) mt_max[mte_node_type(x)] #endif @@ -110,6 +111,7 @@ static const unsigned char mt_slots[] = { [maple_leaf_64] = MAPLE_RANGE64_SLOTS, [maple_range_64] = MAPLE_RANGE64_SLOTS, [maple_arange_64] = MAPLE_ARANGE64_SLOTS, + [maple_copy] = 3, }; #define mt_slot_count(x) mt_slots[mte_node_type(x)] @@ -118,6 +120,7 @@ static const unsigned char mt_pivots[] = { [maple_leaf_64] = MAPLE_RANGE64_SLOTS - 1, [maple_range_64] = MAPLE_RANGE64_SLOTS - 1, [maple_arange_64] = MAPLE_ARANGE64_SLOTS - 1, + [maple_copy] = 3, }; #define mt_pivot_count(x) mt_pivots[mte_node_type(x)] @@ -126,6 +129,7 @@ static const unsigned char mt_min_slots[] = { [maple_leaf_64] = (MAPLE_RANGE64_SLOTS / 2) - 2, [maple_range_64] = (MAPLE_RANGE64_SLOTS / 2) - 2, [maple_arange_64] = (MAPLE_ARANGE64_SLOTS / 2) - 1, + [maple_copy] = 1, /* Should never be used */ }; #define mt_min_slot_count(x) mt_min_slots[mte_node_type(x)] @@ -627,6 +631,7 @@ static inline unsigned 
long *ma_gaps(struct maple_node *node, case maple_arange_64: return node->ma64.gap; case maple_copy: + return node->cp.gap; case maple_range_64: case maple_leaf_64: case maple_dense: -- cgit v1.2.3 From a9c6716e088a1d4badd4fa6797469506bb99ec8b Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:22 -0500 Subject: maple_tree: start using maple copy node for destination Stop using the maple subtree state and big node in favour of using three destinations in the maple copy node. That is, expand the way leaves were handled to all levels of the tree and use the maple copy node to track the new nodes. Extract out the sibling init into the data calculation since this is where the insufficient data can be detected. The remainder of the sibling code to shift the next iteration is moved to the spanning_ascend() function, since it is not always needed. Next introduce the dst_setup() function which will decide how many nodes are needed to contain the data at this level. Using the destination count, populate the copy node's dst array with the new nodes and set d_count to the correct value. Note that this can be tricky in the case of a leaf node with exactly enough room because of the rule against NULLs at the end of leaves. Once the destinations are ready, copy the data by altering the cp_data_write() function to copy from the sources to the destinations directly. This eliminates the use of the big node in this code path. On node completion, node_finalise() will zero out the remaining area and set the metadata, if necessary. spanning_ascend() is used to decide if the operation is complete. It may create a new root, converge into one destination, or continue upwards by ascending the left and right write maple states. One test case setup needed to be tweaked so that the targeted node was surrounded by full nodes. 
[akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20260130205935.2559335-18-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 14 + lib/maple_tree.c | 624 +++++++++++++++++++++++++++------------ tools/testing/radix-tree/maple.c | 2 +- 3 files changed, 458 insertions(+), 182 deletions(-) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index db6a02788902..0c464eade1d6 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -156,6 +156,17 @@ enum store_type { }; struct maple_copy { + /* + * min, max, and pivots are values + * start, end, split are indexes into arrays + * data is a size + */ + + struct { + struct maple_node *node; + unsigned long max; + enum maple_type mt; + } dst[3]; struct { struct maple_node *node; unsigned long max; @@ -178,7 +189,10 @@ struct maple_copy { /*Avoid passing these around */ unsigned char s_count; + unsigned char d_count; + unsigned char split; unsigned char data; + unsigned char height; }; /** diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 9c701ee7412c..4d9e7f00f5c8 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -353,6 +353,13 @@ static inline struct maple_enode *mt_mk_node(const struct maple_node *node, (type << MAPLE_ENODE_TYPE_SHIFT) | MAPLE_ENODE_NULL); } +static inline void ma_init_slot(void __rcu **slot, const struct maple_node *mn, + const enum maple_type mt) +{ + /* WARNING: this is unsafe if the slot is exposed to readers. 
*/ + RCU_INIT_POINTER(*slot, (void *)mt_mk_node(mn, mt)); +} + static inline void *mte_mk_root(const struct maple_enode *node) { return (void *)((unsigned long)node | MAPLE_ROOT_NODE); @@ -1320,6 +1327,21 @@ void wr_mas_setup(struct ma_wr_state *wr_mas, struct ma_state *mas) wr_mas->r_max = mas_safe_pivot(mas, wr_mas->pivots, mas->offset, wr_mas->type); } + +static inline +void wr_mas_ascend(struct ma_wr_state *wr_mas) +{ + struct ma_state *mas = wr_mas->mas; + + mas_ascend(mas); + wr_mas_setup(wr_mas, mas); + mas->end = ma_data_end(wr_mas->node, wr_mas->type, wr_mas->pivots, + mas->max); + /* Careful, this may be wrong.. */ + wr_mas->end_piv = wr_mas->r_max; + wr_mas->offset_end = mas->offset; +} + static inline unsigned long ma_leaf_max_gap(struct maple_node *mn, enum maple_type mt, unsigned long min, unsigned long max, unsigned long *pivots, void __rcu **slots) @@ -2507,6 +2529,112 @@ static inline void mas_wmb_replace(struct ma_state *mas, mas_update_gap(mas); } +/* + * node_copy() - Copy from one node to another. 
+ * + * @mas: The maple state + * @src: The source node + * @start: The offset into the src to start copying + * @size: The size to copy (non-zero) + * @s_max: The source node max + * @s_mt: The source maple node type + * @dst: The destination + * @d_start: The start location in the destination node + * @d_mt: The destination maple node type + */ +static inline +unsigned long node_copy(struct ma_state *mas, struct maple_node *src, + unsigned char start, unsigned char size, unsigned long s_max, + enum maple_type s_mt, struct maple_node *dst, unsigned char d_start, + enum maple_type d_mt) +{ + unsigned long *s_pivots, *d_pivots; + void __rcu **s_slots, **d_slots; + unsigned long *s_gaps, *d_gaps; + unsigned long d_max; + + d_slots = ma_slots(dst, d_mt) + d_start; + d_pivots = ma_pivots(dst, d_mt) + d_start; + s_slots = ma_slots(src, s_mt) + start; + s_pivots = ma_pivots(src, s_mt) + start; + memcpy(d_slots, s_slots, size * sizeof(void __rcu *)); + if (!ma_is_leaf(d_mt) && s_mt == maple_copy) { + struct maple_enode *edst = mt_mk_node(dst, d_mt); + + + for (int i = 0; i < size; i++) + mas_set_parent(mas, + mt_slot_locked(mas->tree, d_slots, i), + edst, d_start + i); + } + + d_gaps = ma_gaps(dst, d_mt); + if (d_gaps) { + s_gaps = ma_gaps(src, s_mt) + start; + d_gaps += d_start; + memcpy(d_gaps, s_gaps, size * sizeof(unsigned long)); + } + + if (start + size - 1 < mt_pivots[s_mt]) + d_max = s_pivots[size - 1]; + else + d_max = s_max; + + if (d_start + size <= mt_pivots[d_mt]) + d_pivots[size - 1] = d_max; + + size--; + if (size) + memcpy(d_pivots, s_pivots, size * sizeof(unsigned long)); + + return d_max; +} + +/* + * node_finalise() - Zero out unused area and populate metadata + * @node: The maple node + * @mt: The maple node type + * @end: The end of the used area + */ +static inline +void node_finalise(struct maple_node *node, enum maple_type mt, + unsigned char end) +{ + unsigned char max_end = mt_slots[mt]; + unsigned char size; + unsigned long *gaps; + unsigned 
char gap_slot; + + gaps = ma_gaps(node, mt); + if (end < max_end - 1) { + size = max_end - end; + memset(ma_slots(node, mt) + end, 0, size * sizeof(void *)); + + if (gaps) + memset(gaps + end, 0, size * sizeof(unsigned long)); + + if (--size) + memset(ma_pivots(node, mt) + end, 0, size * sizeof(unsigned long)); + } + + gap_slot = 0; + if (gaps && !ma_is_leaf(mt)) { + unsigned long max_gap; + + max_gap = 0; + for (int i = 0; i <= end; i++) + if (gaps[i] > max_gap) { + gap_slot = i; + max_gap = gaps[i]; + } + } + + if (mt == maple_arange_64) + ma_set_meta(node, mt, gap_slot, end - 1); + else if (end <= max_end - 1) + ma_set_meta(node, mt, gap_slot, end - 1); +} + /* * mast_cp_to_nodes() - Copy data out to nodes. * @mast: The maple subtree state @@ -2684,6 +2812,7 @@ static inline void cp_leaf_init(struct maple_copy *cp, * result in buggy code when a compiler reorders the instructions. */ + cp->height = 1; /* Create entries to insert including split entries to left and right */ if (l_wr_mas->r_min < mas->index) { end++; @@ -2726,6 +2855,100 @@ static inline void cp_data_calc(struct maple_copy *cp, cp->data += r_wr_mas->mas->end - r_wr_mas->offset_end; } +/* + * spanning_data() - Calculate the @cp data and populate @sib if insufficient + * @cp: The maple copy node + * @l_wr_mas: The left write maple state + * @r_wr_mas: The right write maple state + * @sib: The maple state of the sibling. + * + * Note: @cp->data is a size and not indexed by 0. @sib->end may be set to 0 to + * indicate it will not be used. 
+ */ +static inline void spanning_data(struct maple_copy *cp, + struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas, + struct ma_state *sib) +{ + cp_data_calc(cp, l_wr_mas, r_wr_mas); + if (((l_wr_mas->mas->min != 0) || (r_wr_mas->mas->max != ULONG_MAX)) && + (cp->data <= mt_min_slots[l_wr_mas->type])) { + spanning_sib(l_wr_mas, r_wr_mas, sib); + cp->data += sib->end + 1; + } else { + sib->end = 0; + } +} + +/* + * dst_setup() - Set up one or more destinations for the new data. + * @cp: The maple copy node + * @mas: The maple state + * @mt: The source node type + */ +static inline +void dst_setup(struct maple_copy *cp, struct ma_state *mas, enum maple_type mt) +{ + /* Data is 1 indexed, every src has +1 added. */ + + if (cp->data <= mt_slots[mt]) { + cp->split = cp->data - 1; + cp->d_count = 1; + goto node_setup; + } + + cp->split = (cp->data - 1) / 2; + cp->d_count = 2; + if (cp->data < mt_slots[mt] * 2) + goto node_setup; + + if (cp->data == mt_slots[mt] * 2) { + unsigned char off; + unsigned char s; + + if (!ma_is_leaf(mt)) + goto node_setup; + + /* + * Leaf nodes are a bit tricky because we cannot assume the data + * can fit due to the NULL limitation on node ends. 
+ */ + off = cp->split; + for (s = 0; s < cp->s_count; s++) { + unsigned char s_off; + + s_off = cp->src[s].end - cp->src[s].start; + if (s_off >= off) + break; + + s_off++; + off -= s_off; + } + + off += cp->src[s].start; + if (ma_slots(cp->src[s].node, cp->src[s].mt)[off]) + goto node_setup; + + cp->split++; + if (cp->split < mt_slots[mt]) + goto node_setup; + + cp->split -= 2; + if (cp->data - 2 - cp->split < mt_slots[mt]) + goto node_setup; + + } + + /* No other choice but to 3-way split the data */ + cp->split = (cp->data + 2) / 3; + cp->d_count = 3; + +node_setup: + for (int i = 0; i < cp->d_count; i++) { + cp->dst[i].mt = mt; + cp->dst[i].node = ma_mnode_ptr(mas_pop_node(mas)); + } +} + static inline void append_mas_cp(struct maple_copy *cp, struct ma_state *mas, unsigned char start, unsigned char end) { @@ -2813,38 +3036,153 @@ void multi_src_setup(struct maple_copy *cp, struct ma_wr_state *l_wr_mas, } static inline -void cp_data_write(struct maple_copy *cp, struct maple_big_node *b_node) +void cp_data_write(struct maple_copy *cp, struct ma_state *mas) { - struct maple_node *src; - unsigned char s; + struct maple_node *dst, *src; + unsigned char s, d; + unsigned char dst_offset; + unsigned char data_offset; unsigned char src_end, s_offset; - unsigned long *b_pivots, *cp_pivots; - void __rcu **b_slots, **cp_slots; - enum maple_type s_mt; + unsigned char split; + unsigned long s_max, d_max; + unsigned char dst_size; + enum maple_type s_mt, d_mt; + + data_offset = 0; + s = d = 0; + /* Readability help */ + src = cp->src[s].node; + dst = cp->dst[d].node; + s_offset = cp->src[s].start; + src_end = cp->src[s].end; + split = cp->split; + s_max = cp->src[s].max; + s_mt = cp->src[s].mt; + d_mt = cp->dst[d].mt; + do { + dst_offset = 0; + d_max = 0; + dst = cp->dst[d].node; + d_mt = cp->dst[d].mt; + dst_size = split + 1; - b_node->b_end = 0; + while (dst_size) { + unsigned char size; - s = 0; - b_pivots = b_node->pivot; - b_slots = (void __rcu **)b_node->slot; - do { 
- unsigned char size; - - src = cp->src[s].node; - s_mt = cp->src[s].mt; - s_offset = cp->src[s].start; - src_end = cp->src[s].end; - size = src_end - s_offset + 1; - cp_pivots = ma_pivots(src, s_mt) + s_offset; - cp_slots = ma_slots(src, s_mt) + s_offset; - memcpy(b_slots, cp_slots, size * sizeof(void __rcu *)); - if (size > 1) - memcpy(b_pivots, cp_pivots, (size - 1) * sizeof(unsigned long)); - b_pivots[size - 1] = cp->src[s].max; - b_pivots += size; - b_slots += size; - b_node->b_end += size; - } while (++s < cp->s_count); + if (src_end - s_offset + 1 < dst_size) + size = src_end - s_offset + 1; + else + size = dst_size; + + d_max = node_copy(mas, src, s_offset, size, s_max, s_mt, + dst, dst_offset, d_mt); + + dst_offset += size; + s_offset += size; + if (s_offset > src_end) { + /* This source is exhausted */ + s++; + if (s >= cp->s_count) { + cp->dst[d].max = d_max; + node_finalise(dst, d_mt, dst_offset); + return; + } + /* Reset local src */ + src = cp->src[s].node; + s_offset = cp->src[s].start; + src_end = cp->src[s].end; + s_max = cp->src[s].max; + s_mt = cp->src[s].mt; + } + + dst_size -= size; + data_offset += size; + } + + split = cp->split; + cp->dst[d].max = d_max; + /* Handle null entries */ + if (cp->dst[d].max != ULONG_MAX && + !ma_slots(dst, d_mt)[dst_offset - 1]) { + if (s_offset == cp->src[s].start) { + s--; + src = cp->src[s].node; + src_end = cp->src[s].end; + s_max = cp->src[s].max; + s_mt = cp->src[s].mt; + s_offset = src_end; + } else { + s_offset--; + } + /* Set dst max and clear pivot */ + split++; + data_offset--; + dst_offset--; + cp->dst[d].max = ma_pivots(dst, d_mt)[dst_offset - 1]; + } + + node_finalise(dst, d_mt, dst_offset); + ++d; /* Next destination */ + if (d == cp->d_count - 1) + split = cp->data - data_offset; + + if (d >= cp->d_count) { + WARN_ON(data_offset < cp->data); + return; + } + + } while (data_offset <= cp->data); +} + +/* + * cp_dst_to_slots() - Migrate the maple copy destination to the maple copy + * slots + * @cp: 
The maple copy node + * @min: The minimal value represented + * @max: The maximum value represented + * @mas: The maple state + */ +static inline void cp_dst_to_slots(struct maple_copy *cp, unsigned long min, + unsigned long max, struct ma_state *mas) +{ + unsigned char d; + unsigned long slot_min = min; + + for (d = 0; d < cp->d_count; d++) { + struct maple_node *mn = cp->dst[d].node; + enum maple_type mt = cp->dst[d].mt; + unsigned long slot_max = cp->dst[d].max; + + /* + * Warning, see cp_leaf_init() comment and rcu_assign_pointer() + * documentation. Since these are new nodes, there are no + * read-side operations that can view them until they are + * inserted into the tree after an rcu_assign_pointer() call. + */ + ma_init_slot(&cp->slot[d], mn, mt); + cp->pivot[d] = slot_max; + if (mt_is_alloc(mas->tree)) { + if (ma_is_leaf(mt)) { + cp->gap[d] = ma_leaf_max_gap(mn, mt, slot_min, + slot_max, ma_pivots(mn, mt), + ma_slots(mn, mt)); + } else { + unsigned long *gaps = ma_gaps(mn, mt); + + if (gaps) { + unsigned char gap_slot; + + gap_slot = ma_meta_gap(mn); + cp->gap[d] = gaps[gap_slot]; + } + } + } + slot_min = slot_max + 1; + } + + cp->end = cp->d_count - 1; + cp->min = min; + cp->max = max; } static void mas_spanning_rebalance_loop(struct ma_state *mas, @@ -3000,173 +3338,97 @@ static void mas_spanning_rebalance(struct ma_state *mas, mas_spanning_rebalance_loop(mas, mast, count); } - -static noinline void mas_wr_spanning_rebalance(struct ma_state *mas, - struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas) +/* + * spanning_ascend() - See if a spanning store operation has to keep walking up + * the tree + * @cp: The maple_copy node + * @l_wr_mas: The left maple write state + * @r_wr_mas: The right maple write state + * @sib: the maple state of the sibling + * + * Returns: True if another iteration is necessary. 
+ */ +static bool spanning_ascend(struct maple_copy *cp, struct ma_state *mas, + struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas, + struct ma_state *sib) { - - unsigned char split, mid_split; - unsigned char slot = 0; - unsigned char new_height = 0; /* used if node is a new root */ - struct maple_enode *left = NULL, *middle = NULL, *right = NULL; - struct maple_enode *old_enode; - - struct maple_subtree_state mast; - struct maple_big_node b_node; - struct maple_copy cp; - unsigned char height; - struct ma_state sib; - MA_STATE(l_mas, mas->tree, mas->index, mas->index); - MA_STATE(r_mas, mas->tree, mas->index, mas->last); - MA_STATE(m_mas, mas->tree, mas->index, mas->index); - MA_STATE(mast_l_mas, NULL, 0, 0); - - - memset(&b_node, 0, sizeof(struct maple_big_node)); - mast_l_mas = *mas; - cp.s_count = 0; - cp_leaf_init(&cp, mas, l_wr_mas, r_wr_mas); - cp_data_calc(&cp, l_wr_mas, r_wr_mas); - if (((l_wr_mas->mas->min != 0) || (r_wr_mas->mas->max != ULONG_MAX)) && - (cp.data <= mt_min_slots[l_wr_mas->type])) { - spanning_sib(l_wr_mas, r_wr_mas, &sib); - cp.data += sib.end + 1; - } else { - sib.end = 0; - } - - multi_src_setup(&cp, l_wr_mas, r_wr_mas, &sib); - b_node.type = l_wr_mas->type; - cp_data_write(&cp, &b_node); - if (sib.end) { - if (sib.max < l_wr_mas->mas->min) { - *l_wr_mas->mas = sib; - wr_mas_setup(l_wr_mas, &sib); - mast_l_mas = sib; - } else { - *r_wr_mas->mas = sib; - wr_mas_setup(r_wr_mas, &sib); - } + if (sib->end) { + if (sib->max < l_wr_mas->mas->min) + *l_wr_mas->mas = *sib; + else + *r_wr_mas->mas = *sib; } - mast.orig_l = &mast_l_mas; - mast.orig_r = r_wr_mas->mas; - /* Stop spanning searches by searching for just index. */ - mast.orig_l->last = mas->index; + cp_dst_to_slots(cp, l_wr_mas->mas->min, r_wr_mas->mas->max, mas); + if (!cp->min && cp->max == ULONG_MAX) { + /* New root */ + if (cp->d_count != 1) { + enum maple_type mt = maple_arange_64; - mast.bn = &b_node; - /* Combine l_mas and r_mas and split them up evenly again. 
*/ + if (!mt_is_alloc(mas->tree)) + mt = maple_range_64; - /* - * The tree needs to be rebalanced and leaves need to be kept at the same level. - * Rebalancing is done by use of the ``struct maple_topiary``. - */ - mast.l = &l_mas; - mast.m = &m_mas; - mast.r = &r_mas; - l_mas.status = r_mas.status = m_mas.status = ma_none; - height = mas_mt_height(mas) + 1; - - /* - * Each level of the tree is examined and balanced, pushing data to the left or - * right, or rebalancing against left or right nodes is employed to avoid - * rippling up the tree to limit the amount of churn. Once a new sub-section of - * the tree is created, there may be a mix of new and old nodes. The old nodes - * will have the incorrect parent pointers and currently be in two trees: the - * original tree and the partially new tree. To remedy the parent pointers in - * the old tree, the new data is swapped into the active tree and a walk down - * the tree is performed and the parent pointers are updated. - * See mas_topiary_replace() for more information. - */ - while (height--) { - mast.bn->b_end--; - mast.bn->type = mte_node_type(mast.orig_l->node); - split = mas_mab_to_node(mas, mast.bn, &left, &right, &middle, - &mid_split); - mast_set_split_parents(&mast, left, middle, right, split, - mid_split); - mast_cp_to_nodes(&mast, left, middle, right, split, mid_split); - new_height++; - - /* - * Copy data from next level in the tree to mast.bn from next - * iteration - */ - memset(mast.bn, 0, sizeof(struct maple_big_node)); - mast.bn->type = mte_node_type(left); - - /* Root already stored in l->node. */ - if (mas_is_root_limits(mast.l)) - goto new_root; - - mast_ascend(&mast); - mast_combine_cp_left(&mast); - mast.l->offset = mast.bn->b_end; - mab_set_b_end(mast.bn, mast.l, left); - mab_set_b_end(mast.bn, mast.m, middle); - mab_set_b_end(mast.bn, mast.r, right); - - /* Copy anything necessary out of the right node. 
*/ - mast_combine_cp_right(&mast); - mast.orig_l->last = mast.orig_l->max; - - if (mast_sufficient(&mast)) { - if (mast_overflow(&mast)) - continue; - - if (mast.orig_l->node == mast.orig_r->node) { - /* - * The data in b_node should be stored in one - * node and in the tree - */ - slot = mast.l->offset; - break; - } - - continue; + cp->data = cp->d_count; + cp->s_count = 0; + dst_setup(cp, mas, mt); + init_cp_src(cp); + node_copy(mas, cp->src[0].node, 0, cp->data, cp->max, maple_copy, + cp->dst[0].node, 0, mt); + node_finalise(cp->dst[0].node, mt, cp->end + 1); + /* + * Warning, see cp_leaf_init() comment and rcu_assign_pointer() + * documentation. Since this is a new root, there are no + * read-side operations that can view it until it is inserted into + * the tree after an rcu_assign_pointer() call. + */ + ma_init_slot(&cp->slot[0], cp->dst[0].node, mt); + cp->height++; } - - /* May be a new root stored in mast.bn */ - if (mas_is_root_limits(mast.orig_l)) - break; - - mast_spanning_rebalance(&mast); - - /* rebalancing from other nodes may require another loop.
*/ - if (!height) - height++; + WARN_ON_ONCE(cp->dst[0].node != mte_to_node( + mt_slot_locked(mas->tree, cp->slot, 0))); + cp->dst[0].node->parent = ma_parent_ptr(mas_tree_parent(mas)); + mas->min = 0; + mas->max = ULONG_MAX; + mas->depth = 0; + mas->node = mas_root_locked(mas); + return false; } - mast.l->node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), - mte_node_type(mast.orig_l->node)); + /* Converged and has a single destination */ + if ((cp->d_count == 1) && + (l_wr_mas->mas->node == r_wr_mas->mas->node)) { + cp->dst[0].node->parent = ma_parent_ptr(mas_mn(mas)->parent); + return false; + } - mab_mas_cp(mast.bn, 0, mt_slots[mast.bn->type] - 1, mast.l, true); - new_height++; - mas_set_parent(mas, left, mast.l->node, slot); - if (middle) - mas_set_parent(mas, middle, mast.l->node, ++slot); + cp->height++; + wr_mas_ascend(l_wr_mas); + wr_mas_ascend(r_wr_mas); + return true; +} - if (right) - mas_set_parent(mas, right, mast.l->node, ++slot); +static noinline void mas_wr_spanning_rebalance(struct ma_state *mas, + struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas) +{ - if (mas_is_root_limits(mast.l)) { -new_root: - mas_mn(mast.l)->parent = ma_parent_ptr(mas_tree_parent(mas)); - while (!mte_is_root(mast.orig_l->node)) - mast_ascend(&mast); - } else { - mas_mn(mast.l)->parent = mas_mn(mast.orig_l)->parent; - } + struct maple_enode *old_enode; + struct maple_copy cp; + struct ma_state sib; - old_enode = mast.orig_l->node; - mas->depth = mast.l->depth; - mas->node = mast.l->node; - mas->min = mast.l->min; - mas->max = mast.l->max; - mas->offset = mast.l->offset; - mas_wmb_replace(mas, old_enode, new_height); + cp_leaf_init(&cp, mas, l_wr_mas, r_wr_mas); + do { + spanning_data(&cp, l_wr_mas, r_wr_mas, &sib); + multi_src_setup(&cp, l_wr_mas, r_wr_mas, &sib); + dst_setup(&cp, mas, l_wr_mas->type); + cp_data_write(&cp, mas); + } while (spanning_ascend(&cp, mas, l_wr_mas, r_wr_mas, &sib)); + + old_enode = mas->node; + mas->node = mt_slot_locked(mas->tree, cp.slot, 
0); + mas_wmb_replace(mas, old_enode, cp.height); mtree_range_walk(mas); } + /* * mas_rebalance() - Rebalance a given node. * @mas: The maple state diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 85fb5616c133..dfd7099f0d8e 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35508,7 +35508,7 @@ static noinline void __init check_spanning_write(struct maple_tree *mt) /* Store a value across a node boundary that causes a 3 way split */ if (MAPLE_32BIT) - i = 49590; /* 0xc1b6 */ + i = 49430; /* 0xc116 */ else i = 49670; /* 0xC206 */ -- cgit v1.2.3 From e4f4fc7aa8b720d934a0bfcea7f8aae4271d308f Mon Sep 17 00:00:00 2001 From: "JP Kobryn (Meta)" Date: Thu, 19 Feb 2026 15:58:46 -0800 Subject: mm: move pgscan, pgsteal, pgrefill to node stats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are situations where reclaim kicks in on a system with free memory. One possible cause is a NUMA imbalance scenario where one or more nodes are under pressure. It would help if we could easily identify such nodes. Move the pgscan, pgsteal, and pgrefill counters from vm_event_item to node_stat_item to provide per-node reclaim visibility. With these counters as node stats, the values are now displayed in the per-node section of /proc/zoneinfo, which allows for quick identification of the affected nodes. /proc/vmstat continues to report the same counters, aggregated across all nodes. But the ordering of these items within the readout changes as they move from the vm events section to the node stats section. Memcg accounting of these counters is preserved. The relocated counters remain visible in memory.stat alongside the existing aggregate pgscan and pgsteal counters. However, this change affects how the global counters are accumulated. Previously, the global event count update was gated on !cgroup_reclaim(), excluding memcg-based reclaim from /proc/vmstat. 
Now that mod_lruvec_state() is being used to update the counters, the global counters will include all reclaim. This is consistent with how pgdemote counters are already tracked. Finally, the virtio_balloon driver is updated to use global_node_page_state() to fetch the counters, as they are no longer accessible through the vm_events array. Link: https://lkml.kernel.org/r/20260219235846.161910-1-jp.kobryn@linux.dev Signed-off-by: JP Kobryn Suggested-by: Johannes Weiner Acked-by: Michael S. Tsirkin Reviewed-by: Vlastimil Babka (SUSE) Acked-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Michal Hocko Cc: Alistair Popple Cc: Axel Rasmussen Cc: Byungchul Park Cc: David Hildenbrand Cc: Eugenio Pérez Cc: Gregory Price Cc: "Huang, Ying" Cc: Jason Wang Cc: Joshua Hahn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mathew Brost Cc: Mike Rapoport Cc: Muchun Song Cc: Qi Zheng Cc: Rakie Kim Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Wei Xu Cc: Xuan Zhuo Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- drivers/virtio/virtio_balloon.c | 8 +++--- include/linux/mmzone.h | 13 ++++++++++ include/linux/vm_event_item.h | 13 ---------- mm/memcontrol.c | 56 ++++++++++++++++++++++++++++------------- mm/vmscan.c | 39 ++++++++++------------------ mm/vmstat.c | 26 +++++++++---------- 6 files changed, 82 insertions(+), 73 deletions(-) (limited to 'include') diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index d1fbc8fe8470..7f15bf162e88 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -369,13 +369,13 @@ static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb) update_stat(vb, idx++, VIRTIO_BALLOON_S_ALLOC_STALL, stall); update_stat(vb, idx++, VIRTIO_BALLOON_S_ASYNC_SCAN, - pages_to_bytes(events[PGSCAN_KSWAPD])); + pages_to_bytes(global_node_page_state(PGSCAN_KSWAPD))); update_stat(vb, idx++, VIRTIO_BALLOON_S_DIRECT_SCAN, - pages_to_bytes(events[PGSCAN_DIRECT])); + 
pages_to_bytes(global_node_page_state(PGSCAN_DIRECT))); update_stat(vb, idx++, VIRTIO_BALLOON_S_ASYNC_RECLAIM, - pages_to_bytes(events[PGSTEAL_KSWAPD])); + pages_to_bytes(global_node_page_state(PGSTEAL_KSWAPD))); update_stat(vb, idx++, VIRTIO_BALLOON_S_DIRECT_RECLAIM, - pages_to_bytes(events[PGSTEAL_DIRECT])); + pages_to_bytes(global_node_page_state(PGSTEAL_DIRECT))); #ifdef CONFIG_HUGETLB_PAGE update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3e51190a55e4..546bca95ca40 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -255,6 +255,19 @@ enum node_stat_item { PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, PGDEMOTE_PROACTIVE, + PGSTEAL_KSWAPD, + PGSTEAL_DIRECT, + PGSTEAL_KHUGEPAGED, + PGSTEAL_PROACTIVE, + PGSTEAL_ANON, + PGSTEAL_FILE, + PGSCAN_KSWAPD, + PGSCAN_DIRECT, + PGSCAN_KHUGEPAGED, + PGSCAN_PROACTIVE, + PGSCAN_ANON, + PGSCAN_FILE, + PGREFILL, #ifdef CONFIG_HUGETLB_PAGE NR_HUGETLB, #endif diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 22a139f82d75..03fe95f5a020 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -38,21 +38,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE, PGFAULT, PGMAJFAULT, PGLAZYFREED, - PGREFILL, PGREUSE, - PGSTEAL_KSWAPD, - PGSTEAL_DIRECT, - PGSTEAL_KHUGEPAGED, - PGSTEAL_PROACTIVE, - PGSCAN_KSWAPD, - PGSCAN_DIRECT, - PGSCAN_KHUGEPAGED, - PGSCAN_PROACTIVE, PGSCAN_DIRECT_THROTTLE, - PGSCAN_ANON, - PGSCAN_FILE, - PGSTEAL_ANON, - PGSTEAL_FILE, #ifdef CONFIG_NUMA PGSCAN_ZONE_RECLAIM_SUCCESS, PGSCAN_ZONE_RECLAIM_FAILED, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 772bac21d155..af75f10150a8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -330,6 +330,19 @@ static const unsigned int memcg_node_stat_items[] = { PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, PGDEMOTE_PROACTIVE, + PGSTEAL_KSWAPD, + PGSTEAL_DIRECT, + PGSTEAL_KHUGEPAGED, + 
PGSTEAL_PROACTIVE, + PGSTEAL_ANON, + PGSTEAL_FILE, + PGSCAN_KSWAPD, + PGSCAN_DIRECT, + PGSCAN_KHUGEPAGED, + PGSCAN_PROACTIVE, + PGSCAN_ANON, + PGSCAN_FILE, + PGREFILL, #ifdef CONFIG_HUGETLB_PAGE NR_HUGETLB, #endif @@ -443,17 +456,8 @@ static const unsigned int memcg_vm_event_stat[] = { #endif PSWPIN, PSWPOUT, - PGSCAN_KSWAPD, - PGSCAN_DIRECT, - PGSCAN_KHUGEPAGED, - PGSCAN_PROACTIVE, - PGSTEAL_KSWAPD, - PGSTEAL_DIRECT, - PGSTEAL_KHUGEPAGED, - PGSTEAL_PROACTIVE, PGFAULT, PGMAJFAULT, - PGREFILL, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE, @@ -1400,6 +1404,15 @@ static const struct memory_stat memory_stats[] = { { "pgdemote_direct", PGDEMOTE_DIRECT }, { "pgdemote_khugepaged", PGDEMOTE_KHUGEPAGED }, { "pgdemote_proactive", PGDEMOTE_PROACTIVE }, + { "pgsteal_kswapd", PGSTEAL_KSWAPD }, + { "pgsteal_direct", PGSTEAL_DIRECT }, + { "pgsteal_khugepaged", PGSTEAL_KHUGEPAGED }, + { "pgsteal_proactive", PGSTEAL_PROACTIVE }, + { "pgscan_kswapd", PGSCAN_KSWAPD }, + { "pgscan_direct", PGSCAN_DIRECT }, + { "pgscan_khugepaged", PGSCAN_KHUGEPAGED }, + { "pgscan_proactive", PGSCAN_PROACTIVE }, + { "pgrefill", PGREFILL }, #ifdef CONFIG_NUMA_BALANCING { "pgpromote_success", PGPROMOTE_SUCCESS }, #endif @@ -1443,6 +1456,15 @@ static int memcg_page_state_output_unit(int item) case PGDEMOTE_DIRECT: case PGDEMOTE_KHUGEPAGED: case PGDEMOTE_PROACTIVE: + case PGSTEAL_KSWAPD: + case PGSTEAL_DIRECT: + case PGSTEAL_KHUGEPAGED: + case PGSTEAL_PROACTIVE: + case PGSCAN_KSWAPD: + case PGSCAN_DIRECT: + case PGSCAN_KHUGEPAGED: + case PGSCAN_PROACTIVE: + case PGREFILL: #ifdef CONFIG_NUMA_BALANCING case PGPROMOTE_SUCCESS: #endif @@ -1514,15 +1536,15 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) /* Accumulated memory events */ seq_buf_printf(s, "pgscan %lu\n", - memcg_events(memcg, PGSCAN_KSWAPD) + - memcg_events(memcg, PGSCAN_DIRECT) + - memcg_events(memcg, PGSCAN_PROACTIVE) + - memcg_events(memcg, PGSCAN_KHUGEPAGED)); + memcg_page_state(memcg, PGSCAN_KSWAPD) + + 
memcg_page_state(memcg, PGSCAN_DIRECT) + + memcg_page_state(memcg, PGSCAN_PROACTIVE) + + memcg_page_state(memcg, PGSCAN_KHUGEPAGED)); seq_buf_printf(s, "pgsteal %lu\n", - memcg_events(memcg, PGSTEAL_KSWAPD) + - memcg_events(memcg, PGSTEAL_DIRECT) + - memcg_events(memcg, PGSTEAL_PROACTIVE) + - memcg_events(memcg, PGSTEAL_KHUGEPAGED)); + memcg_page_state(memcg, PGSTEAL_KSWAPD) + + memcg_page_state(memcg, PGSTEAL_DIRECT) + + memcg_page_state(memcg, PGSTEAL_PROACTIVE) + + memcg_page_state(memcg, PGSTEAL_KHUGEPAGED)); for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) { #ifdef CONFIG_MEMCG_V1 diff --git a/mm/vmscan.c b/mm/vmscan.c index 0fc9373e8251..031c5c035a82 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1984,7 +1984,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, unsigned long nr_taken; struct reclaim_stat stat; bool file = is_file_lru(lru); - enum vm_event_item item; + enum node_stat_item item; struct pglist_data *pgdat = lruvec_pgdat(lruvec); bool stalled = false; @@ -2010,10 +2010,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); item = PGSCAN_KSWAPD + reclaimer_offset(sc); - if (!cgroup_reclaim(sc)) - __count_vm_events(item, nr_scanned); - count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); - __count_vm_events(PGSCAN_ANON + file, nr_scanned); + mod_lruvec_state(lruvec, item, nr_scanned); + mod_lruvec_state(lruvec, PGSCAN_ANON + file, nr_scanned); spin_unlock_irq(&lruvec->lru_lock); @@ -2030,10 +2028,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, stat.nr_demoted); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); item = PGSTEAL_KSWAPD + reclaimer_offset(sc); - if (!cgroup_reclaim(sc)) - __count_vm_events(item, nr_reclaimed); - count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); - __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); + mod_lruvec_state(lruvec, item, nr_reclaimed); + 
mod_lruvec_state(lruvec, PGSTEAL_ANON + file, nr_reclaimed); lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); @@ -2120,9 +2116,7 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); - if (!cgroup_reclaim(sc)) - __count_vm_events(PGREFILL, nr_scanned); - count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); + mod_lruvec_state(lruvec, PGREFILL, nr_scanned); spin_unlock_irq(&lruvec->lru_lock); @@ -4543,7 +4537,7 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, { int i; int gen; - enum vm_event_item item; + enum node_stat_item item; int sorted = 0; int scanned = 0; int isolated = 0; @@ -4551,7 +4545,6 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, int scan_batch = min(nr_to_scan, MAX_LRU_BATCH); int remaining = scan_batch; struct lru_gen_folio *lrugen = &lruvec->lrugen; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); VM_WARN_ON_ONCE(!list_empty(list)); @@ -4602,13 +4595,9 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, } item = PGSCAN_KSWAPD + reclaimer_offset(sc); - if (!cgroup_reclaim(sc)) { - __count_vm_events(item, isolated); - __count_vm_events(PGREFILL, sorted); - } - count_memcg_events(memcg, item, isolated); - count_memcg_events(memcg, PGREFILL, sorted); - __count_vm_events(PGSCAN_ANON + type, isolated); + mod_lruvec_state(lruvec, item, isolated); + mod_lruvec_state(lruvec, PGREFILL, sorted); + mod_lruvec_state(lruvec, PGSCAN_ANON + type, isolated); trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, scan_batch, scanned, skipped, isolated, type ? 
LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); @@ -4693,7 +4682,7 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, LIST_HEAD(clean); struct folio *folio; struct folio *next; - enum vm_event_item item; + enum node_stat_item item; struct reclaim_stat stat; struct lru_gen_mm_walk *walk; bool skip_retry = false; @@ -4757,10 +4746,8 @@ retry: stat.nr_demoted); item = PGSTEAL_KSWAPD + reclaimer_offset(sc); - if (!cgroup_reclaim(sc)) - __count_vm_events(item, reclaimed); - count_memcg_events(memcg, item, reclaimed); - __count_vm_events(PGSTEAL_ANON + type, reclaimed); + mod_lruvec_state(lruvec, item, reclaimed); + mod_lruvec_state(lruvec, PGSTEAL_ANON + type, reclaimed); spin_unlock_irq(&lruvec->lru_lock); diff --git a/mm/vmstat.c b/mm/vmstat.c index 86b14b0f77b5..44bbb7752f11 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1276,6 +1276,19 @@ const char * const vmstat_text[] = { [I(PGDEMOTE_DIRECT)] = "pgdemote_direct", [I(PGDEMOTE_KHUGEPAGED)] = "pgdemote_khugepaged", [I(PGDEMOTE_PROACTIVE)] = "pgdemote_proactive", + [I(PGSTEAL_KSWAPD)] = "pgsteal_kswapd", + [I(PGSTEAL_DIRECT)] = "pgsteal_direct", + [I(PGSTEAL_KHUGEPAGED)] = "pgsteal_khugepaged", + [I(PGSTEAL_PROACTIVE)] = "pgsteal_proactive", + [I(PGSTEAL_ANON)] = "pgsteal_anon", + [I(PGSTEAL_FILE)] = "pgsteal_file", + [I(PGSCAN_KSWAPD)] = "pgscan_kswapd", + [I(PGSCAN_DIRECT)] = "pgscan_direct", + [I(PGSCAN_KHUGEPAGED)] = "pgscan_khugepaged", + [I(PGSCAN_PROACTIVE)] = "pgscan_proactive", + [I(PGSCAN_ANON)] = "pgscan_anon", + [I(PGSCAN_FILE)] = "pgscan_file", + [I(PGREFILL)] = "pgrefill", #ifdef CONFIG_HUGETLB_PAGE [I(NR_HUGETLB)] = "nr_hugetlb", #endif @@ -1318,21 +1331,8 @@ const char * const vmstat_text[] = { [I(PGMAJFAULT)] = "pgmajfault", [I(PGLAZYFREED)] = "pglazyfreed", - [I(PGREFILL)] = "pgrefill", [I(PGREUSE)] = "pgreuse", - [I(PGSTEAL_KSWAPD)] = "pgsteal_kswapd", - [I(PGSTEAL_DIRECT)] = "pgsteal_direct", - [I(PGSTEAL_KHUGEPAGED)] = "pgsteal_khugepaged", - [I(PGSTEAL_PROACTIVE)] = 
"pgsteal_proactive", - [I(PGSCAN_KSWAPD)] = "pgscan_kswapd", - [I(PGSCAN_DIRECT)] = "pgscan_direct", - [I(PGSCAN_KHUGEPAGED)] = "pgscan_khugepaged", - [I(PGSCAN_PROACTIVE)] = "pgscan_proactive", [I(PGSCAN_DIRECT_THROTTLE)] = "pgscan_direct_throttle", - [I(PGSCAN_ANON)] = "pgscan_anon", - [I(PGSCAN_FILE)] = "pgscan_file", - [I(PGSTEAL_ANON)] = "pgsteal_anon", - [I(PGSTEAL_FILE)] = "pgsteal_file", #ifdef CONFIG_NUMA [I(PGSCAN_ZONE_RECLAIM_SUCCESS)] = "zone_reclaim_success", -- cgit v1.2.3 From 0d6af9bcf383bcdf601e670bb605861b01e318e7 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 18 Feb 2026 04:06:34 +0800 Subject: mm, swap: use the swap table to track the swap count Now all the infrastructures are ready, switch to using the swap table only. This is unfortunately a large patch because the whole old counting mechanism, especially SWP_CONTINUED, has to be gone and switch to the new mechanism together, with no intermediate steps available. The swap table is capable of holding up to SWP_TB_COUNT_MAX - 1 counts in the higher bits of each table entry, so using that, the swap_map can be completely dropped. swap_map also had a limit of SWAP_CONT_MAX. Any value beyond that limit will require a COUNT_CONTINUED page. COUNT_CONTINUED is a bit complex to maintain, so for the swap table, a simpler approach is used: when the count goes beyond SWP_TB_COUNT_MAX - 1, the cluster will have an extend_table allocated, which is a swap cluster-sized array of unsigned int. The counting is basically offloaded there until the count drops below SWP_TB_COUNT_MAX again. Both the swap table and the extend table are cluster-based, so they exhibit good performance and sparsity. 
To make the switch from swap_map to swap table clean, this commit cleans up and introduces a new set of functions based on the swap table design, for manipulating swap counts: - __swap_cluster_dup_entry, __swap_cluster_put_entry, __swap_cluster_alloc_entries, __swap_cluster_free_entries: Increase/decrease the count of a swap slot, or alloc / free swap slots. These are the internal routines that do the counting work based on the swap table and handle all the complexities. The caller must lock the cluster before calling them. All swap count-related update operations are wrapped by these four helpers. - swap_dup_entries_cluster, swap_put_entries_cluster: Increase/decrease the swap count of one or a set of swap slots in the same cluster range. These two helpers serve as the common routines for folio_dup_swap & swap_dup_entry_direct, or folio_put_swap & swap_put_entries_direct. All existing callers are converted to use these helpers. This simplifies the count tracking considerably, and the swap_map is gone.
[ryncsn@gmail.com: fix build] Link: https://lkml.kernel.org/r/aZWuLZi-vYi3vAWe@KASONG-MC4 Link: https://lkml.kernel.org/r/20260218-swap-table-p3-v3-9-f4e34be021a7@tencent.com Signed-off-by: Kairui Song Suggested-by: Chris Li Acked-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kairui Song Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Nhat Pham Signed-off-by: Andrew Morton --- include/linux/swap.h | 28 +- mm/memory.c | 2 +- mm/swap.h | 14 +- mm/swap_state.c | 53 ++-- mm/swap_table.h | 5 + mm/swapfile.c | 790 +++++++++++++++++++-------------------------------- 6 files changed, 334 insertions(+), 558 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 62fc7499b408..0effe3cc50f5 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -208,7 +208,6 @@ enum { SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */ SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ - SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ SWP_BLKDEV = (1 << 6), /* its a block device */ SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */ @@ -223,16 +222,6 @@ enum { #define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10) #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX -/* Bit flag in swap_map */ -#define COUNT_CONTINUED 0x80 /* Flag swap_map continuation for full count */ - -/* Special value in first swap_map */ -#define SWAP_MAP_MAX 0x3e /* Max count */ -#define SWAP_MAP_BAD 0x3f /* Note page is bad */ - -/* Special value in each swap_map continuation */ -#define SWAP_CONT_MAX 0x7f /* Max count */ - /* * The first page in the swap file is the swap header, which is always marked * bad to prevent it from being allocated as an entry. 
This also prevents the @@ -264,8 +253,7 @@ struct swap_info_struct { signed short prio; /* swap priority of this type */ struct plist_node list; /* entry in swap_active_head */ signed char type; /* strange name for an index */ - unsigned int max; /* extent of the swap_map */ - unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + unsigned int max; /* size of this swap device */ unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ struct list_head free_clusters; /* free clusters list */ @@ -284,18 +272,14 @@ struct swap_info_struct { struct completion comp; /* seldom referenced */ spinlock_t lock; /* * protect map scan related fields like - * swap_map, inuse_pages and all cluster - * lists. other fields are only changed + * inuse_pages and all cluster lists. + * Other fields are only changed * at swapon/swapoff, so are protected * by swap_lock. changing flags need * hold this lock and swap_lock. If * both locks need hold, hold swap_lock * first. */ - spinlock_t cont_lock; /* - * protect swap count continuation page - * list. 
- */ struct work_struct discard_work; /* discard worker */ struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ @@ -451,7 +435,6 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -extern int add_swap_count_continuation(swp_entry_t, gfp_t); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); @@ -517,11 +500,6 @@ static inline void free_swap_cache(struct folio *folio) { } -static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) -{ - return 0; -} - static inline int swap_dup_entry_direct(swp_entry_t ent) { return 0; diff --git a/mm/memory.c b/mm/memory.c index 2f815a34d924..7084c426f933 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1346,7 +1346,7 @@ again: if (ret == -EIO) { VM_WARN_ON_ONCE(!entry.val); - if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) { + if (swap_retry_table_alloc(entry, GFP_KERNEL) < 0) { ret = -ENOMEM; goto out; } diff --git a/mm/swap.h b/mm/swap.h index bfafa637c458..0a91e21e92b1 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -37,6 +37,7 @@ struct swap_cluster_info { u8 flags; u8 order; atomic_long_t __rcu *table; /* Swap table entries, see mm/swap_table.h */ + unsigned int *extend_table; /* For large swap count, protected by ci->lock */ struct list_head list; }; @@ -183,6 +184,8 @@ static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) spin_unlock_irq(&ci->lock); } +extern int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp); + /* * Below are the core routines for doing swap for a folio. 
* All helpers requires the folio to be locked, and a locked folio @@ -206,9 +209,9 @@ int folio_dup_swap(struct folio *folio, struct page *subpage); void folio_put_swap(struct folio *folio, struct page *subpage); /* For internal use */ -extern void swap_entries_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset, unsigned int nr_pages); +extern void __swap_cluster_free_entries(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned int ci_off, unsigned int nr_pages); /* linux/mm/page_io.c */ int sio_pool_init(void); @@ -446,6 +449,11 @@ static inline int swap_writeout(struct folio *folio, return 0; } +static inline int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp) +{ + return -EINVAL; +} + static inline bool swap_cache_has_folio(swp_entry_t entry) { return false; diff --git a/mm/swap_state.c b/mm/swap_state.c index e213ee35c1d2..e7618ffe6d70 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -140,21 +140,20 @@ void *swap_cache_get_shadow(swp_entry_t entry) void __swap_cache_add_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry) { - unsigned long new_tb; - unsigned int ci_start, ci_off, ci_end; + unsigned int ci_off = swp_cluster_offset(entry), ci_end; unsigned long nr_pages = folio_nr_pages(folio); + unsigned long pfn = folio_pfn(folio); + unsigned long old_tb; VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); - new_tb = folio_to_swp_tb(folio, 0); - ci_start = swp_cluster_offset(entry); - ci_off = ci_start; - ci_end = ci_start + nr_pages; + ci_end = ci_off + nr_pages; do { - VM_WARN_ON_ONCE(swp_tb_is_folio(__swap_table_get(ci, ci_off))); - __swap_table_set(ci, ci_off, new_tb); + old_tb = __swap_table_get(ci, ci_off); + VM_WARN_ON_ONCE(swp_tb_is_folio(old_tb)); + __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb))); } while 
(++ci_off < ci_end); folio_ref_add(folio, nr_pages); @@ -183,14 +182,13 @@ static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, unsigned long old_tb; struct swap_info_struct *si; struct swap_cluster_info *ci; - unsigned int ci_start, ci_off, ci_end, offset; + unsigned int ci_start, ci_off, ci_end; unsigned long nr_pages = folio_nr_pages(folio); si = __swap_entry_to_info(entry); ci_start = swp_cluster_offset(entry); ci_end = ci_start + nr_pages; ci_off = ci_start; - offset = swp_offset(entry); ci = swap_cluster_lock(si, swp_offset(entry)); if (unlikely(!ci->table)) { err = -ENOENT; @@ -202,13 +200,12 @@ static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, err = -EEXIST; goto failed; } - if (unlikely(!__swap_count(swp_entry(swp_type(entry), offset)))) { + if (unlikely(!__swp_tb_get_count(old_tb))) { err = -ENOENT; goto failed; } if (swp_tb_is_shadow(old_tb)) shadow = swp_tb_to_shadow(old_tb); - offset++; } while (++ci_off < ci_end); __swap_cache_add_folio(ci, folio, entry); swap_cluster_unlock(ci); @@ -237,8 +234,9 @@ failed: void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow) { + int count; + unsigned long old_tb; struct swap_info_struct *si; - unsigned long old_tb, new_tb; unsigned int ci_start, ci_off, ci_end; bool folio_swapped = false, need_free = false; unsigned long nr_pages = folio_nr_pages(folio); @@ -249,20 +247,20 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio); si = __swap_entry_to_info(entry); - new_tb = shadow_to_swp_tb(shadow, 0); ci_start = swp_cluster_offset(entry); ci_end = ci_start + nr_pages; ci_off = ci_start; do { - /* If shadow is NULL, we sets an empty shadow */ - old_tb = __swap_table_xchg(ci, ci_off, new_tb); + old_tb = __swap_table_get(ci, ci_off); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != folio); - if 
(__swap_count(swp_entry(si->type, - swp_offset(entry) + ci_off - ci_start))) + count = __swp_tb_get_count(old_tb); + if (count) folio_swapped = true; else need_free = true; + /* If shadow is NULL, we sets an empty shadow. */ + __swap_table_set(ci, ci_off, shadow_to_swp_tb(shadow, count)); } while (++ci_off < ci_end); folio->swap.val = 0; @@ -271,13 +269,13 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); if (!folio_swapped) { - swap_entries_free(si, ci, swp_offset(entry), nr_pages); + __swap_cluster_free_entries(si, ci, ci_start, nr_pages); } else if (need_free) { + ci_off = ci_start; do { - if (!__swap_count(entry)) - swap_entries_free(si, ci, swp_offset(entry), 1); - entry.val++; - } while (--nr_pages); + if (!__swp_tb_get_count(__swap_table_get(ci, ci_off))) + __swap_cluster_free_entries(si, ci, ci_off, 1); + } while (++ci_off < ci_end); } } @@ -324,17 +322,18 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci, unsigned long nr_pages = folio_nr_pages(new); unsigned int ci_off = swp_cluster_offset(entry); unsigned int ci_end = ci_off + nr_pages; - unsigned long old_tb, new_tb; + unsigned long pfn = folio_pfn(new); + unsigned long old_tb; VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new)); VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new)); VM_WARN_ON_ONCE(!entry.val); /* Swap cache still stores N entries instead of a high-order entry */ - new_tb = folio_to_swp_tb(new, 0); do { - old_tb = __swap_table_xchg(ci, ci_off, new_tb); + old_tb = __swap_table_get(ci, ci_off); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old); + __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb))); } while (++ci_off < ci_end); /* @@ -368,7 +367,7 @@ void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents) ci_end = ci_off + nr_ents; do { old = __swap_table_xchg(ci, ci_off, null_to_swp_tb()); - 
WARN_ON_ONCE(swp_tb_is_folio(old)); + WARN_ON_ONCE(swp_tb_is_folio(old) || swp_tb_get_count(old)); } while (++ci_off < ci_end); } diff --git a/mm/swap_table.h b/mm/swap_table.h index 10762ac5f4f5..8415ffbe2b9c 100644 --- a/mm/swap_table.h +++ b/mm/swap_table.h @@ -191,6 +191,11 @@ static inline int swp_tb_get_count(unsigned long swp_tb) return -EINVAL; } +static inline unsigned long __swp_tb_mk_count(unsigned long swp_tb, int count) +{ + return ((swp_tb & ~SWP_TB_COUNT_MASK) | __count_to_swp_tb(count)); +} + /* * Helpers for accessing or modifying the swap table of a cluster, * the swap cluster must be locked. diff --git a/mm/swapfile.c b/mm/swapfile.c index 54a19ebce540..cf976ecae8a8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -51,15 +51,8 @@ #include "swap_table.h" #include "swap.h" -static bool swap_count_continued(struct swap_info_struct *, pgoff_t, - unsigned char); -static void free_swap_count_continuations(struct swap_info_struct *); static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); -static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr); -static void swap_put_entry_locked(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset); static bool folio_swapcache_freeable(struct folio *folio); static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, @@ -182,22 +175,19 @@ static long swap_usage_in_pages(struct swap_info_struct *si) /* Reclaim the swap entry if swap is getting full */ #define TTRS_FULL 0x4 -static bool swap_only_has_cache(struct swap_info_struct *si, - struct swap_cluster_info *ci, +static bool swap_only_has_cache(struct swap_cluster_info *ci, unsigned long offset, int nr_pages) { unsigned int ci_off = offset % SWAPFILE_CLUSTER; - unsigned char *map = si->swap_map + offset; - unsigned char *map_end = map + nr_pages; + unsigned int ci_end = ci_off + nr_pages; unsigned long swp_tb; do { swp_tb = 
__swap_table_get(ci, ci_off); VM_WARN_ON_ONCE(!swp_tb_is_folio(swp_tb)); - if (*map) + if (swp_tb_get_count(swp_tb)) return false; - ++ci_off; - } while (++map < map_end); + } while (++ci_off < ci_end); return true; } @@ -256,7 +246,7 @@ again: * reference or pending writeback, and can't be allocated to others. */ ci = swap_cluster_lock(si, offset); - need_reclaim = swap_only_has_cache(si, ci, offset, nr_pages); + need_reclaim = swap_only_has_cache(ci, offset, nr_pages); swap_cluster_unlock(ci); if (!need_reclaim) goto out_unlock; @@ -479,6 +469,7 @@ static void swap_cluster_assert_empty(struct swap_cluster_info *ci, } while (++ci_off < ci_end); WARN_ON_ONCE(bad_slots != (swapoff ? ci->count : 0)); + WARN_ON_ONCE(nr == SWAPFILE_CLUSTER && ci->extend_table); } static void swap_cluster_free_table(struct swap_cluster_info *ci) @@ -807,7 +798,6 @@ static int swap_cluster_setup_bad_slot(struct swap_info_struct *si, pr_warn("Duplicated bad slot offset %d\n", offset); ret = -EINVAL; } else { - si->swap_map[offset] = SWAP_MAP_BAD; ci->count++; } spin_unlock(&ci->lock); @@ -829,18 +819,16 @@ static bool cluster_reclaim_range(struct swap_info_struct *si, { unsigned int nr_pages = 1 << order; unsigned long offset = start, end = start + nr_pages; - unsigned char *map = si->swap_map; unsigned long swp_tb; spin_unlock(&ci->lock); do { - if (READ_ONCE(map[offset])) - break; swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); - if (swp_tb_is_folio(swp_tb)) { + if (swp_tb_get_count(swp_tb)) + break; + if (swp_tb_is_folio(swp_tb)) if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY) < 0) break; - } } while (++offset < end); spin_lock(&ci->lock); @@ -864,7 +852,7 @@ static bool cluster_reclaim_range(struct swap_info_struct *si, */ for (offset = start; offset < end; offset++) { swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); - if (map[offset] || !swp_tb_is_null(swp_tb)) + if (!swp_tb_is_null(swp_tb)) return false; } @@ -876,37 +864,35 @@ static bool 
cluster_scan_range(struct swap_info_struct *si, unsigned long offset, unsigned int nr_pages, bool *need_reclaim) { - unsigned long end = offset + nr_pages; - unsigned char *map = si->swap_map; + unsigned int ci_off = offset % SWAPFILE_CLUSTER; + unsigned int ci_end = ci_off + nr_pages; unsigned long swp_tb; - if (cluster_is_empty(ci)) - return true; - do { - if (map[offset]) - return false; - swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); - if (swp_tb_is_folio(swp_tb)) { + swp_tb = __swap_table_get(ci, ci_off); + if (swp_tb_is_null(swp_tb)) + continue; + if (swp_tb_is_folio(swp_tb) && !__swp_tb_get_count(swp_tb)) { if (!vm_swap_full()) return false; *need_reclaim = true; - } else { - /* A entry with no count and no cache must be null */ - VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb)); + continue; } - } while (++offset < end); + /* Slot with zero count can only be NULL or folio */ + VM_WARN_ON(!swp_tb_get_count(swp_tb)); + return false; + } while (++ci_off < ci_end); return true; } -static bool cluster_alloc_range(struct swap_info_struct *si, - struct swap_cluster_info *ci, - struct folio *folio, - unsigned int offset) +static bool __swap_cluster_alloc_entries(struct swap_info_struct *si, + struct swap_cluster_info *ci, + struct folio *folio, + unsigned int ci_off) { - unsigned long nr_pages; unsigned int order; + unsigned long nr_pages; lockdep_assert_held(&ci->lock); @@ -925,14 +911,15 @@ static bool cluster_alloc_range(struct swap_info_struct *si, if (likely(folio)) { order = folio_order(folio); nr_pages = 1 << order; - swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, nr_pages, false); - __swap_cache_add_folio(ci, folio, swp_entry(si->type, offset)); + swap_cluster_assert_empty(ci, ci_off, nr_pages, false); + __swap_cache_add_folio(ci, folio, swp_entry(si->type, + ci_off + cluster_offset(si, ci))); } else if (IS_ENABLED(CONFIG_HIBERNATION)) { order = 0; nr_pages = 1; - WARN_ON_ONCE(si->swap_map[offset]); - si->swap_map[offset] = 1; - 
swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, 1, false); + swap_cluster_assert_empty(ci, ci_off, 1, false); + /* Sets a fake shadow as placeholder */ + __swap_table_set(ci, ci_off, shadow_to_swp_tb(NULL, 1)); } else { /* Allocation without folio is only possible with hibernation */ WARN_ON_ONCE(1); @@ -983,7 +970,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, if (!ret) continue; } - if (!cluster_alloc_range(si, ci, folio, offset)) + if (!__swap_cluster_alloc_entries(si, ci, folio, offset % SWAPFILE_CLUSTER)) break; found = offset; offset += nr_pages; @@ -1030,7 +1017,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) long to_scan = 1; unsigned long offset, end; struct swap_cluster_info *ci; - unsigned char *map = si->swap_map; + unsigned long swp_tb; int nr_reclaim; if (force) @@ -1042,8 +1029,8 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) to_scan--; while (offset < end) { - if (!READ_ONCE(map[offset]) && - swp_tb_is_folio(swap_table_get(ci, offset % SWAPFILE_CLUSTER))) { + swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); + if (swp_tb_is_folio(swp_tb) && !__swp_tb_get_count(swp_tb)) { spin_unlock(&ci->lock); nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); @@ -1452,40 +1439,127 @@ start_over: return false; } +static int swap_extend_table_alloc(struct swap_info_struct *si, + struct swap_cluster_info *ci, gfp_t gfp) +{ + void *table; + + table = kzalloc(sizeof(ci->extend_table[0]) * SWAPFILE_CLUSTER, gfp); + if (!table) + return -ENOMEM; + + spin_lock(&ci->lock); + if (!ci->extend_table) + ci->extend_table = table; + else + kfree(table); + spin_unlock(&ci->lock); + return 0; +} + +int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp) +{ + int ret; + struct swap_info_struct *si; + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + + si = get_swap_device(entry); + if (!si) + return 0; + + ci = 
__swap_offset_to_cluster(si, offset); + ret = swap_extend_table_alloc(si, ci, gfp); + + put_swap_device(si); + return ret; +} + +static void swap_extend_table_try_free(struct swap_cluster_info *ci) +{ + unsigned long i; + bool can_free = true; + + if (!ci->extend_table) + return; + + for (i = 0; i < SWAPFILE_CLUSTER; i++) { + if (ci->extend_table[i]) + can_free = false; + } + + if (can_free) { + kfree(ci->extend_table); + ci->extend_table = NULL; + } +} + +/* Decrease the swap count of one slot, without freeing it */ +static void __swap_cluster_put_entry(struct swap_cluster_info *ci, + unsigned int ci_off) +{ + int count; + unsigned long swp_tb; + + lockdep_assert_held(&ci->lock); + swp_tb = __swap_table_get(ci, ci_off); + count = __swp_tb_get_count(swp_tb); + + VM_WARN_ON_ONCE(count <= 0); + VM_WARN_ON_ONCE(count > SWP_TB_COUNT_MAX); + + if (count == SWP_TB_COUNT_MAX) { + count = ci->extend_table[ci_off]; + /* Overflow starts with SWP_TB_COUNT_MAX */ + VM_WARN_ON_ONCE(count < SWP_TB_COUNT_MAX); + count--; + if (count == (SWP_TB_COUNT_MAX - 1)) { + ci->extend_table[ci_off] = 0; + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count)); + swap_extend_table_try_free(ci); + } else { + ci->extend_table[ci_off] = count; + } + } else { + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, --count)); + } +} + /** - * swap_put_entries_cluster - Decrease the swap count of a set of slots. + * swap_put_entries_cluster - Decrease the swap count of slots within one cluster * @si: The swap device. - * @start: start offset of slots. + * @offset: start offset of slots. * @nr: number of slots. - * @reclaim_cache: if true, also reclaim the swap cache. + * @reclaim_cache: if true, also reclaim the swap cache if slots are freed. * * This helper decreases the swap count of a set of slots and tries to * batch free them. Also reclaims the swap cache if @reclaim_cache is true. 
- * Context: The caller must ensure that all slots belong to the same - * cluster and their swap count doesn't go underflow. + * + * Context: The specified slots must be pinned by existing swap count or swap + * cache reference, so they won't be released until this helper returns. */ static void swap_put_entries_cluster(struct swap_info_struct *si, - unsigned long start, int nr, + pgoff_t offset, int nr, bool reclaim_cache) { - unsigned long offset = start, end = start + nr; - unsigned long batch_start = SWAP_ENTRY_INVALID; struct swap_cluster_info *ci; + unsigned int ci_off, ci_end; + pgoff_t end = offset + nr; bool need_reclaim = false; unsigned int nr_reclaimed; unsigned long swp_tb; - unsigned int count; + int ci_batch = -1; ci = swap_cluster_lock(si, offset); + ci_off = offset % SWAPFILE_CLUSTER; + ci_end = ci_off + nr; do { - swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); - count = si->swap_map[offset]; - VM_WARN_ON(count < 1 || count == SWAP_MAP_BAD); - if (count == 1) { + swp_tb = __swap_table_get(ci, ci_off); + if (swp_tb_get_count(swp_tb) == 1) { /* count == 1 and non-cached slots will be batch freed. */ if (!swp_tb_is_folio(swp_tb)) { - if (!batch_start) - batch_start = offset; + if (ci_batch == -1) + ci_batch = ci_off; continue; } /* count will be 0 after put, slot can be reclaimed */ @@ -1497,21 +1571,20 @@ static void swap_put_entries_cluster(struct swap_info_struct *si, * slots will be freed when folio is removed from swap cache * (__swap_cache_del_folio). 
*/ - swap_put_entry_locked(si, ci, offset); - if (batch_start) { - swap_entries_free(si, ci, batch_start, offset - batch_start); - batch_start = SWAP_ENTRY_INVALID; + __swap_cluster_put_entry(ci, ci_off); + if (ci_batch != -1) { + __swap_cluster_free_entries(si, ci, ci_batch, ci_off - ci_batch); + ci_batch = -1; } - } while (++offset < end); + } while (++ci_off < ci_end); - if (batch_start) - swap_entries_free(si, ci, batch_start, offset - batch_start); + if (ci_batch != -1) + __swap_cluster_free_entries(si, ci, ci_batch, ci_off - ci_batch); swap_cluster_unlock(ci); if (!need_reclaim || !reclaim_cache) return; - offset = start; do { nr_reclaimed = __try_to_reclaim_swap(si, offset, TTRS_UNMAPPED | TTRS_FULL); @@ -1521,6 +1594,92 @@ static void swap_put_entries_cluster(struct swap_info_struct *si, } while (offset < end); } +/* Increase the swap count of one slot. */ +static int __swap_cluster_dup_entry(struct swap_cluster_info *ci, + unsigned int ci_off) +{ + int count; + unsigned long swp_tb; + + lockdep_assert_held(&ci->lock); + swp_tb = __swap_table_get(ci, ci_off); + /* Bad or special slots can't be handled */ + if (WARN_ON_ONCE(swp_tb_is_bad(swp_tb))) + return -EINVAL; + count = __swp_tb_get_count(swp_tb); + /* Must be either cached or have a count already */ + if (WARN_ON_ONCE(!count && !swp_tb_is_folio(swp_tb))) + return -ENOENT; + + if (likely(count < (SWP_TB_COUNT_MAX - 1))) { + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count + 1)); + VM_WARN_ON_ONCE(ci->extend_table && ci->extend_table[ci_off]); + } else if (count == (SWP_TB_COUNT_MAX - 1)) { + if (ci->extend_table) { + VM_WARN_ON_ONCE(ci->extend_table[ci_off]); + ci->extend_table[ci_off] = SWP_TB_COUNT_MAX; + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, SWP_TB_COUNT_MAX)); + } else { + return -ENOMEM; + } + } else if (count == SWP_TB_COUNT_MAX) { + VM_WARN_ON_ONCE(ci->extend_table[ci_off] >= + type_max(typeof(ci->extend_table[0]))); + ++ci->extend_table[ci_off]; + } else { + /* Never 
happens unless counting went wrong */ + WARN_ON_ONCE(1); + } + + return 0; +} + +/** + * swap_dup_entries_cluster: Increase the swap count of slots within one cluster. + * @si: The swap device. + * @offset: start offset of slots. + * @nr: number of slots. + * + * Context: The specified slots must be pinned by existing swap count or swap + * cache reference, so they won't be released until this helper returns. + * Return: 0 on success. -ENOMEM if the swap count maxed out (SWP_TB_COUNT_MAX) + * and failed to allocate an extended table, -EINVAL if any entry is bad entry. + */ +static int swap_dup_entries_cluster(struct swap_info_struct *si, + pgoff_t offset, int nr) +{ + int err; + struct swap_cluster_info *ci; + unsigned int ci_start, ci_off, ci_end; + + ci_start = offset % SWAPFILE_CLUSTER; + ci_end = ci_start + nr; + ci_off = ci_start; + ci = swap_cluster_lock(si, offset); +restart: + do { + err = __swap_cluster_dup_entry(ci, ci_off); + if (unlikely(err)) { + if (err == -ENOMEM) { + spin_unlock(&ci->lock); + err = swap_extend_table_alloc(si, ci, GFP_ATOMIC); + spin_lock(&ci->lock); + if (!err) + goto restart; + } + goto failed; + } + } while (++ci_off < ci_end); + swap_cluster_unlock(ci); + return 0; +failed: + while (ci_off-- > ci_start) + __swap_cluster_put_entry(ci, ci_off); + swap_extend_table_try_free(ci); + swap_cluster_unlock(ci); + return err; +} + /** * folio_alloc_swap - allocate swap space for a folio * @folio: folio we want to move to swap @@ -1589,13 +1748,10 @@ again: * Context: Caller must ensure the folio is locked and in the swap cache. * NOTE: The caller also has to ensure there is no raced call to * swap_put_entries_direct on its swap entry before this helper returns, or - * the swap map may underflow. Currently, we only accept @subpage == NULL - * for shmem due to the limitation of swap continuation: shmem always - * duplicates the swap entry only once, so there is no such issue for it. + * the swap count may underflow. 
*/ int folio_dup_swap(struct folio *folio, struct page *subpage) { - int err = 0; swp_entry_t entry = folio->swap; unsigned long nr_pages = folio_nr_pages(folio); @@ -1607,10 +1763,8 @@ int folio_dup_swap(struct folio *folio, struct page *subpage) nr_pages = 1; } - while (!err && __swap_duplicate(entry, 1, nr_pages) == -ENOMEM) - err = add_swap_count_continuation(entry, GFP_ATOMIC); - - return err; + return swap_dup_entries_cluster(swap_entry_to_info(entry), + swp_offset(entry), nr_pages); } /** @@ -1639,28 +1793,6 @@ void folio_put_swap(struct folio *folio, struct page *subpage) swap_put_entries_cluster(si, swp_offset(entry), nr_pages, false); } -static void swap_put_entry_locked(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset) -{ - unsigned char count; - - count = si->swap_map[offset]; - if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { - if (count == COUNT_CONTINUED) { - if (swap_count_continued(si, offset, count)) - count = SWAP_MAP_MAX | COUNT_CONTINUED; - else - count = SWAP_MAP_MAX; - } else - count--; - } - - WRITE_ONCE(si->swap_map[offset], count); - if (!count && !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER))) - swap_entries_free(si, ci, offset, 1); -} - /* * When we get a swap entry, if there aren't some other ways to * prevent swapoff, such as the folio in swap cache is locked, RCU @@ -1727,31 +1859,30 @@ put_out: } /* - * Drop the last ref of swap entries, caller have to ensure all entries - * belong to the same cgroup and cluster. + * Free a set of swap slots after their swap count dropped to zero, or will be + * zero after putting the last ref (saves one __swap_cluster_put_entry call). 
*/ -void swap_entries_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset, unsigned int nr_pages) +void __swap_cluster_free_entries(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned int ci_start, unsigned int nr_pages) { - swp_entry_t entry = swp_entry(si->type, offset); - unsigned char *map = si->swap_map + offset; - unsigned char *map_end = map + nr_pages; + unsigned long old_tb; + unsigned int ci_off = ci_start, ci_end = ci_start + nr_pages; + unsigned long offset = cluster_offset(si, ci) + ci_start; - /* It should never free entries across different clusters */ - VM_BUG_ON(ci != __swap_offset_to_cluster(si, offset + nr_pages - 1)); - VM_BUG_ON(cluster_is_empty(ci)); - VM_BUG_ON(ci->count < nr_pages); + VM_WARN_ON(ci->count < nr_pages); ci->count -= nr_pages; do { - VM_WARN_ON(*map > 1); - *map = 0; - } while (++map < map_end); + old_tb = __swap_table_get(ci, ci_off); + /* Release the last ref, or after swap cache is dropped */ + VM_WARN_ON(!swp_tb_is_shadow(old_tb) || __swp_tb_get_count(old_tb) > 1); + __swap_table_set(ci, ci_off, null_to_swp_tb()); + } while (++ci_off < ci_end); - mem_cgroup_uncharge_swap(entry, nr_pages); + mem_cgroup_uncharge_swap(swp_entry(si->type, offset), nr_pages); swap_range_free(si, offset, nr_pages); - swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, nr_pages, false); + swap_cluster_assert_empty(ci, ci_start, nr_pages, false); if (!ci->count) free_cluster(si, ci); @@ -1761,10 +1892,10 @@ void swap_entries_free(struct swap_info_struct *si, int __swap_count(swp_entry_t entry) { - struct swap_info_struct *si = __swap_entry_to_info(entry); - pgoff_t offset = swp_offset(entry); + struct swap_cluster_info *ci = __swap_entry_to_cluster(entry); + unsigned int ci_off = swp_cluster_offset(entry); - return si->swap_map[offset]; + return swp_tb_get_count(__swap_table_get(ci, ci_off)); } /** @@ -1776,81 +1907,62 @@ bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t 
entry) { pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; - int count; + unsigned long swp_tb; ci = swap_cluster_lock(si, offset); - count = si->swap_map[offset]; + swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); swap_cluster_unlock(ci); - return count && count != SWAP_MAP_BAD; + return swp_tb_get_count(swp_tb) > 0; } /* * How many references to @entry are currently swapped out? - * This considers COUNT_CONTINUED so it returns exact answer. + * This returns exact answer. */ int swp_swapcount(swp_entry_t entry) { - int count, tmp_count, n; struct swap_info_struct *si; struct swap_cluster_info *ci; - struct page *page; - pgoff_t offset; - unsigned char *map; + unsigned long swp_tb; + int count; si = get_swap_device(entry); if (!si) return 0; - offset = swp_offset(entry); - - ci = swap_cluster_lock(si, offset); - - count = si->swap_map[offset]; - if (!(count & COUNT_CONTINUED)) - goto out; - - count &= ~COUNT_CONTINUED; - n = SWAP_MAP_MAX + 1; - - page = vmalloc_to_page(si->swap_map + offset); - offset &= ~PAGE_MASK; - VM_BUG_ON(page_private(page) != SWP_CONTINUED); - - do { - page = list_next_entry(page, lru); - map = kmap_local_page(page); - tmp_count = map[offset]; - kunmap_local(map); - - count += (tmp_count & ~COUNT_CONTINUED) * n; - n *= (SWAP_CONT_MAX + 1); - } while (tmp_count & COUNT_CONTINUED); -out: + ci = swap_cluster_lock(si, swp_offset(entry)); + swp_tb = __swap_table_get(ci, swp_cluster_offset(entry)); + count = swp_tb_get_count(swp_tb); + if (count == SWP_TB_COUNT_MAX) + count = ci->extend_table[swp_cluster_offset(entry)]; swap_cluster_unlock(ci); put_swap_device(si); - return count; + + return count < 0 ? 
0 : count; } static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, swp_entry_t entry, int order) { struct swap_cluster_info *ci; - unsigned char *map = si->swap_map; unsigned int nr_pages = 1 << order; unsigned long roffset = swp_offset(entry); unsigned long offset = round_down(roffset, nr_pages); + unsigned int ci_off; int i; bool ret = false; ci = swap_cluster_lock(si, offset); if (nr_pages == 1) { - if (map[roffset]) + ci_off = roffset % SWAPFILE_CLUSTER; + if (swp_tb_get_count(__swap_table_get(ci, ci_off))) ret = true; goto unlock_out; } for (i = 0; i < nr_pages; i++) { - if (map[offset + i]) { + ci_off = (offset + i) % SWAPFILE_CLUSTER; + if (swp_tb_get_count(__swap_table_get(ci, ci_off))) { ret = true; break; } @@ -2016,7 +2128,8 @@ void swap_free_hibernation_slot(swp_entry_t entry) return; ci = swap_cluster_lock(si, offset); - swap_put_entry_locked(si, ci, offset); + __swap_cluster_put_entry(ci, offset % SWAPFILE_CLUSTER); + __swap_cluster_free_entries(si, ci, offset % SWAPFILE_CLUSTER, 1); swap_cluster_unlock(ci); /* In theory readahead might add it to the swap cache by accident */ @@ -2242,13 +2355,10 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned int type) { pte_t *pte = NULL; - struct swap_info_struct *si; - si = swap_info[type]; do { struct folio *folio; - unsigned long offset; - unsigned char swp_count; + unsigned long swp_tb; softleaf_t entry; int ret; pte_t ptent; @@ -2267,7 +2377,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (swp_type(entry) != type) continue; - offset = swp_offset(entry); pte_unmap(pte); pte = NULL; @@ -2284,8 +2393,9 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, &vmf); } if (!folio) { - swp_count = READ_ONCE(si->swap_map[offset]); - if (swp_count == 0 || swp_count == SWAP_MAP_BAD) + swp_tb = swap_table_get(__swap_entry_to_cluster(entry), + swp_cluster_offset(entry)); + if (swp_tb_get_count(swp_tb) <= 0) continue; return -ENOMEM; } 
@@ -2413,7 +2523,7 @@ unlock: } /* - * Scan swap_map from current position to next entry still in use. + * Scan swap table from current position to next entry still in use. * Return 0 if there are no inuse entries after prev till end of * the map. */ @@ -2422,7 +2532,6 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, { unsigned int i; unsigned long swp_tb; - unsigned char count; /* * No need for swap_lock here: we're just looking @@ -2431,12 +2540,9 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, * allocations from this area (while holding swap_lock). */ for (i = prev + 1; i < si->max; i++) { - count = READ_ONCE(si->swap_map[i]); swp_tb = swap_table_get(__swap_offset_to_cluster(si, i), i % SWAPFILE_CLUSTER); - if (count == SWAP_MAP_BAD) - continue; - if (count || swp_tb_is_folio(swp_tb)) + if (!swp_tb_is_null(swp_tb) && !swp_tb_is_bad(swp_tb)) break; if ((i % LATENCY_LIMIT) == 0) cond_resched(); @@ -2796,7 +2902,6 @@ static void flush_percpu_swap_cluster(struct swap_info_struct *si) SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; - unsigned char *swap_map; unsigned long *zeromap; struct swap_cluster_info *cluster_info; struct file *swap_file, *victim; @@ -2874,8 +2979,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) flush_percpu_swap_cluster(p); destroy_swap_extents(p, p->swap_file); - if (p->flags & SWP_CONTINUED) - free_swap_count_continuations(p); if (!(p->flags & SWP_SOLIDSTATE)) atomic_dec(&nr_rotate_swap); @@ -2887,8 +2990,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) swap_file = p->swap_file; p->swap_file = NULL; - swap_map = p->swap_map; - p->swap_map = NULL; zeromap = p->zeromap; p->zeromap = NULL; maxpages = p->max; @@ -2902,7 +3003,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_unlock(&swapon_mutex); kfree(p->global_cluster); p->global_cluster = NULL; - vfree(swap_map); kvfree(zeromap); 
free_swap_cluster_info(cluster_info, maxpages); /* Destroy swap account information */ @@ -3122,7 +3222,6 @@ static struct swap_info_struct *alloc_swap_info(void) kvfree(defer); } spin_lock_init(&p->lock); - spin_lock_init(&p->cont_lock); atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT); init_completion(&p->comp); @@ -3249,19 +3348,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si, return maxpages; } -static int setup_swap_map(struct swap_info_struct *si, - union swap_header *swap_header, - unsigned long maxpages) -{ - unsigned char *swap_map; - - swap_map = vzalloc(maxpages); - si->swap_map = swap_map; - if (!swap_map) - return -ENOMEM; - return 0; -} - static int setup_swap_clusters_info(struct swap_info_struct *si, union swap_header *swap_header, unsigned long maxpages) @@ -3446,11 +3532,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) maxpages = si->max; - /* Setup the swap map and apply bad block */ - error = setup_swap_map(si, swap_header, maxpages); - if (error) - goto bad_swap_unlock_inode; - /* Set up the swap cluster info */ error = setup_swap_clusters_info(si, swap_header, maxpages); if (error) @@ -3571,8 +3652,6 @@ bad_swap: inode = NULL; destroy_swap_extents(si, swap_file); swap_cgroup_swapoff(si->type); - vfree(si->swap_map); - si->swap_map = NULL; free_swap_cluster_info(si->cluster_info, si->max); si->cluster_info = NULL; kvfree(si->zeromap); @@ -3613,322 +3692,29 @@ void si_swapinfo(struct sysinfo *val) spin_unlock(&swap_lock); } -/* - * Verify that nr swap entries are valid and increment their swap map counts. - * - * Returns error code in following case. - * - success -> 0 - * - swp_entry is invalid -> EINVAL - * - swap-mapped reference is requested but the entry is not used. -> ENOENT - * - swap-mapped reference requested but needs continued swap count. 
-> ENOMEM - */ -static int swap_dup_entries(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset, - unsigned char usage, int nr) -{ - int i; - unsigned char count; - - for (i = 0; i < nr; i++) { - count = si->swap_map[offset + i]; - /* - * For swapin out, allocator never allocates bad slots. for - * swapin, readahead is guarded by swap_entry_swapped. - */ - if (WARN_ON(count == SWAP_MAP_BAD)) - return -ENOENT; - /* - * Swap count duplication must be guarded by either swap cache folio (from - * folio_dup_swap) or external lock of existing entry (from swap_dup_entry_direct). - */ - if (WARN_ON(!count && - !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER)))) - return -ENOENT; - if (WARN_ON((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)) - return -EINVAL; - } - - for (i = 0; i < nr; i++) { - count = si->swap_map[offset + i]; - if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) - count += usage; - else if (swap_count_continued(si, offset + i, count)) - count = COUNT_CONTINUED; - else { - /* - * Don't need to rollback changes, because if - * usage == 1, there must be nr == 1. - */ - return -ENOMEM; - } - - WRITE_ONCE(si->swap_map[offset + i], count); - } - - return 0; -} - -static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) -{ - int err; - struct swap_info_struct *si; - struct swap_cluster_info *ci; - unsigned long offset = swp_offset(entry); - - si = swap_entry_to_info(entry); - if (WARN_ON_ONCE(!si)) { - pr_err("%s%08lx\n", Bad_file, entry.val); - return -EINVAL; - } - - VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); - ci = swap_cluster_lock(si, offset); - err = swap_dup_entries(si, ci, offset, usage, nr); - swap_cluster_unlock(ci); - return err; -} - /* * swap_dup_entry_direct() - Increase reference count of a swap entry by one. * @entry: first swap entry from which we want to increase the refcount. 
* - * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required - * but could not be atomically allocated. Returns 0, just as if it succeeded, - * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which - * might occur if a page table entry has got corrupted. + * Returns 0 for success, or -ENOMEM if the extend table is required + * but could not be atomically allocated. Returns -EINVAL if the swap + * entry is invalid, which might occur if a page table entry has got + * corrupted. * * Context: Caller must ensure there is no race condition on the reference * owner. e.g., locking the PTL of a PTE containing the entry being increased. */ int swap_dup_entry_direct(swp_entry_t entry) -{ - int err = 0; - while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM) - err = add_swap_count_continuation(entry, GFP_ATOMIC); - return err; -} - -/* - * add_swap_count_continuation - called when a swap count is duplicated - * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's - * page of the original vmalloc'ed swap_map, to hold the continuation count - * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called - * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. - * - * These continuation pages are seldom referenced: the common paths all work - * on the original swap_map, only referring to a continuation page when the - * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. - * - * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding - * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) - * can be called after dropping locks. 
- */ -int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) { struct swap_info_struct *si; - struct swap_cluster_info *ci; - struct page *head; - struct page *page; - struct page *list_page; - pgoff_t offset; - unsigned char count; - int ret = 0; - - /* - * When debugging, it's easier to use __GFP_ZERO here; but it's better - * for latency not to zero a page while GFP_ATOMIC and holding locks. - */ - page = alloc_page(gfp_mask | __GFP_HIGHMEM); - - si = get_swap_device(entry); - if (!si) { - /* - * An acceptable race has occurred since the failing - * __swap_duplicate(): the swap device may be swapoff - */ - goto outer; - } - - offset = swp_offset(entry); - - ci = swap_cluster_lock(si, offset); - - count = si->swap_map[offset]; - - if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { - /* - * The higher the swap count, the more likely it is that tasks - * will race to add swap count continuation: we need to avoid - * over-provisioning. - */ - goto out; - } - - if (!page) { - ret = -ENOMEM; - goto out; - } - - head = vmalloc_to_page(si->swap_map + offset); - offset &= ~PAGE_MASK; - - spin_lock(&si->cont_lock); - /* - * Page allocation does not initialize the page's lru field, - * but it does always reset its private field. - */ - if (!page_private(head)) { - BUG_ON(count & COUNT_CONTINUED); - INIT_LIST_HEAD(&head->lru); - set_page_private(head, SWP_CONTINUED); - si->flags |= SWP_CONTINUED; - } - - list_for_each_entry(list_page, &head->lru, lru) { - unsigned char *map; - - /* - * If the previous map said no continuation, but we've found - * a continuation page, free our allocation and use this one. - */ - if (!(count & COUNT_CONTINUED)) - goto out_unlock_cont; - - map = kmap_local_page(list_page) + offset; - count = *map; - kunmap_local(map); - - /* - * If this continuation count now has some space in it, - * free our allocation and use this one. 
- */ - if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) - goto out_unlock_cont; - } - list_add_tail(&page->lru, &head->lru); - page = NULL; /* now it's attached, don't free it */ -out_unlock_cont: - spin_unlock(&si->cont_lock); -out: - swap_cluster_unlock(ci); - put_swap_device(si); -outer: - if (page) - __free_page(page); - return ret; -} - -/* - * swap_count_continued - when the original swap_map count is incremented - * from SWAP_MAP_MAX, check if there is already a continuation page to carry - * into, carry if so, or else fail until a new continuation page is allocated; - * when the original swap_map count is decremented from 0 with continuation, - * borrow from the continuation and report whether it still holds more. - * Called while __swap_duplicate() or caller of swap_put_entry_locked() - * holds cluster lock. - */ -static bool swap_count_continued(struct swap_info_struct *si, - pgoff_t offset, unsigned char count) -{ - struct page *head; - struct page *page; - unsigned char *map; - bool ret; - - head = vmalloc_to_page(si->swap_map + offset); - if (page_private(head) != SWP_CONTINUED) { - BUG_ON(count & COUNT_CONTINUED); - return false; /* need to add count continuation */ - } - - spin_lock(&si->cont_lock); - offset &= ~PAGE_MASK; - page = list_next_entry(head, lru); - map = kmap_local_page(page) + offset; - - if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ - goto init_map; /* jump over SWAP_CONT_MAX checks */ - - if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ - /* - * Think of how you add 1 to 999 - */ - while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { - kunmap_local(map); - page = list_next_entry(page, lru); - BUG_ON(page == head); - map = kmap_local_page(page) + offset; - } - if (*map == SWAP_CONT_MAX) { - kunmap_local(map); - page = list_next_entry(page, lru); - if (page == head) { - ret = false; /* add count continuation */ - goto out; - } - map = kmap_local_page(page) + offset; -init_map: *map = 0; /* we didn't 
zero the page */ - } - *map += 1; - kunmap_local(map); - while ((page = list_prev_entry(page, lru)) != head) { - map = kmap_local_page(page) + offset; - *map = COUNT_CONTINUED; - kunmap_local(map); - } - ret = true; /* incremented */ - - } else { /* decrementing */ - /* - * Think of how you subtract 1 from 1000 - */ - BUG_ON(count != COUNT_CONTINUED); - while (*map == COUNT_CONTINUED) { - kunmap_local(map); - page = list_next_entry(page, lru); - BUG_ON(page == head); - map = kmap_local_page(page) + offset; - } - BUG_ON(*map == 0); - *map -= 1; - if (*map == 0) - count = 0; - kunmap_local(map); - while ((page = list_prev_entry(page, lru)) != head) { - map = kmap_local_page(page) + offset; - *map = SWAP_CONT_MAX | count; - count = COUNT_CONTINUED; - kunmap_local(map); - } - ret = count == COUNT_CONTINUED; + si = swap_entry_to_info(entry); + if (WARN_ON_ONCE(!si)) { + pr_err("%s%08lx\n", Bad_file, entry.val); + return -EINVAL; } -out: - spin_unlock(&si->cont_lock); - return ret; -} -/* - * free_swap_count_continuations - swapoff free all the continuation pages - * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. - */ -static void free_swap_count_continuations(struct swap_info_struct *si) -{ - pgoff_t offset; - - for (offset = 0; offset < si->max; offset += PAGE_SIZE) { - struct page *head; - head = vmalloc_to_page(si->swap_map + offset); - if (page_private(head)) { - struct page *page, *next; - - list_for_each_entry_safe(page, next, &head->lru, lru) { - list_del(&page->lru); - __free_page(page); - } - } - } + return swap_dup_entries_cluster(si, swp_offset(entry), 1); } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -- cgit v1.2.3 From 1beb9b7223d2a1f1872f76a3d29b0a4a3cee4171 Mon Sep 17 00:00:00 2001 From: "Pratyush Yadav (Google)" Date: Mon, 16 Feb 2026 19:59:32 +0100 Subject: memfd: export memfd_{add,get}_seals() Patch series "mm: memfd_luo: preserve file seals", v2. 
This series adds support for preserving file seals when preserving a memfd using LUO. Patch 1 exports some memfd seal manipulation functions and patch 2 adds support for preserving them. Since it makes changes to the serialized data structure for memfd, it also bumps the version number. This patch (of 2): Support for preserving file seals will be added to memfd preservation using the Live Update Orchestrator (LUO). Export memfd_{add,get}_seals() so memfd_luo can use them to manipulate the seals. Link: https://lkml.kernel.org/r/20260216185946.1215770-1-pratyush@kernel.org Link: https://lkml.kernel.org/r/20260216185946.1215770-2-pratyush@kernel.org Signed-off-by: Pratyush Yadav (Google) Acked-by: Mike Rapoport (Microsoft) Tested-by: Samiullah Khawaja Cc: Alexander Graf Cc: Baolin Wang Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- include/linux/memfd.h | 12 ++++++++++++ mm/memfd.c | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/memfd.h b/include/linux/memfd.h index c328a7b356d0..b4fda09dab9f 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -18,6 +18,8 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); */ int memfd_check_seals_mmap(struct file *file, vm_flags_t *vm_flags_ptr); struct file *memfd_alloc_file(const char *name, unsigned int flags); +int memfd_get_seals(struct file *file); +int memfd_add_seals(struct file *file, unsigned int seals); #else static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { @@ -37,6 +39,16 @@ static inline struct file *memfd_alloc_file(const char *name, unsigned int flags { return ERR_PTR(-EINVAL); } + +static inline int memfd_get_seals(struct file *file) +{ + return -EINVAL; +} + +static inline int memfd_add_seals(struct file *file, unsigned int seals) +{ + return -EINVAL; +} #endif #endif /* __LINUX_MEMFD_H */ diff --git a/mm/memfd.c b/mm/memfd.c index 
919c2a53eb96..fb425f4e315f 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -227,7 +227,7 @@ static unsigned int *memfd_file_seals_ptr(struct file *file) F_SEAL_WRITE | \ F_SEAL_FUTURE_WRITE) -static int memfd_add_seals(struct file *file, unsigned int seals) +int memfd_add_seals(struct file *file, unsigned int seals) { struct inode *inode = file_inode(file); unsigned int *file_seals; @@ -309,7 +309,7 @@ unlock: return error; } -static int memfd_get_seals(struct file *file) +int memfd_get_seals(struct file *file) { unsigned int *seals = memfd_file_seals_ptr(file); -- cgit v1.2.3 From 8a552d68a86ef0e6fb2ff4af13031a5e82c0f1d0 Mon Sep 17 00:00:00 2001 From: "Pratyush Yadav (Google)" Date: Mon, 16 Feb 2026 19:59:33 +0100 Subject: mm: memfd_luo: preserve file seals File seals are used on memfd for making shared memory communication with untrusted peers safer and simpler. Seals provide a guarantee that certain operations won't be allowed on the file such as writes or truncations. Maintaining these guarantees across a live update will help keeping such use cases secure. These guarantees will also be needed for IOMMUFD preservation with LUO. Normally when IOMMUFD maps a memfd, it pins all its pages to make sure any truncation operations on the memfd don't lead to IOMMUFD using freed memory. This doesn't work with LUO since the preserved memfd might have completely different pages after a live update, and mapping them back to the IOMMUFD will cause all sorts of problems. Using and preserving the seals allows IOMMUFD preservation logic to trust the memfd. Since the uABI defines seals as an int, preserve them by introducing a new u32 field. There are currently only 6 possible seals, so the extra bits are unused and provide room for future expansion. Since the seals are uABI, it is safe to use them directly in the ABI. While at it, also add a u32 flags field. It makes sure the struct is nicely aligned, and can be used later to support things like MFD_CLOEXEC. 
Since the serialization structure is changed, bump the version number to "memfd-v2". It is important to note that the memfd-v2 version only supports seals that existed when this version was defined. This set is defined by MEMFD_LUO_ALL_SEALS. Any new seal might bring a completely different semantic with it and the parser for memfd-v2 cannot be expected to deal with that. If there are any future seals added, they will need another version bump. Link: https://lkml.kernel.org/r/20260216185946.1215770-3-pratyush@kernel.org Signed-off-by: Pratyush Yadav (Google) Tested-by: Samiullah Khawaja Cc: Alexander Graf Cc: Baolin Wang Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: Mike Rapoport (Microsoft) Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- include/linux/kho/abi/memfd.h | 18 +++++++++++++++++- mm/memfd_luo.c | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 50 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/kho/abi/memfd.h b/include/linux/kho/abi/memfd.h index 68cb6303b846..08b10fea2afc 100644 --- a/include/linux/kho/abi/memfd.h +++ b/include/linux/kho/abi/memfd.h @@ -56,10 +56,24 @@ struct memfd_luo_folio_ser { u64 index; } __packed; +/* + * The set of seals this version supports preserving. If support for any new + * seals is needed, add it here and bump version. + */ +#define MEMFD_LUO_ALL_SEALS (F_SEAL_SEAL | \ + F_SEAL_SHRINK | \ + F_SEAL_GROW | \ + F_SEAL_WRITE | \ + F_SEAL_FUTURE_WRITE | \ + F_SEAL_EXEC) + /** * struct memfd_luo_ser - Main serialization structure for a memfd. * @pos: The file's current position (f_pos). * @size: The total size of the file in bytes (i_size). + * @seals: The seals present on the memfd. The seals are uABI so it is safe + * to directly use them in the ABI. + * @flags: Flags for the file. Unused flag bits must be set to 0. * @nr_folios: Number of folios in the folios array. * @folios: KHO vmalloc descriptor pointing to the array of * struct memfd_luo_folio_ser. 
@@ -67,11 +81,13 @@ struct memfd_luo_folio_ser { struct memfd_luo_ser { u64 pos; u64 size; + u32 seals; + u32 flags; u64 nr_folios; struct kho_vmalloc folios; } __packed; /* The compatibility string for memfd file handler */ -#define MEMFD_LUO_FH_COMPATIBLE "memfd-v1" +#define MEMFD_LUO_FH_COMPATIBLE "memfd-v2" #endif /* _LINUX_KHO_ABI_MEMFD_H */ diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c index b8edb9f981d7..bc7f4f045edf 100644 --- a/mm/memfd_luo.c +++ b/mm/memfd_luo.c @@ -79,6 +79,8 @@ #include #include #include +#include + #include "internal.h" static int memfd_luo_preserve_folios(struct file *file, @@ -259,7 +261,7 @@ static int memfd_luo_preserve(struct liveupdate_file_op_args *args) struct memfd_luo_folio_ser *folios_ser; struct memfd_luo_ser *ser; u64 nr_folios; - int err = 0; + int err = 0, seals; inode_lock(inode); shmem_freeze(inode, true); @@ -271,8 +273,21 @@ static int memfd_luo_preserve(struct liveupdate_file_op_args *args) goto err_unlock; } + seals = memfd_get_seals(args->file); + if (seals < 0) { + err = seals; + goto err_free_ser; + } + + /* Make sure the file only has the seals supported by this version. */ + if (seals & ~MEMFD_LUO_ALL_SEALS) { + err = -EOPNOTSUPP; + goto err_free_ser; + } + ser->pos = args->file->f_pos; ser->size = i_size_read(inode); + ser->seals = seals; err = memfd_luo_preserve_folios(args->file, &ser->folios, &folios_ser, &nr_folios); @@ -486,13 +501,29 @@ static int memfd_luo_retrieve(struct liveupdate_file_op_args *args) if (!ser) return -EINVAL; - file = memfd_alloc_file("", 0); + /* Make sure the file only has seals supported by this version. */ + if (ser->seals & ~MEMFD_LUO_ALL_SEALS) { + err = -EOPNOTSUPP; + goto free_ser; + } + + /* + * The seals are preserved. Allow sealing here so they can be added + * later. 
+ */ + file = memfd_alloc_file("", MFD_ALLOW_SEALING); if (IS_ERR(file)) { pr_err("failed to setup file: %pe\n", file); err = PTR_ERR(file); goto free_ser; } + err = memfd_add_seals(file, ser->seals); + if (err) { + pr_err("failed to add seals: %pe\n", ERR_PTR(err)); + goto put_file; + } + vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE); file->f_inode->i_size = ser->size; -- cgit v1.2.3 From c9cb94c6b85a2854ae03c874331b0880ee735441 Mon Sep 17 00:00:00 2001 From: Asier Gutierrez Date: Fri, 13 Feb 2026 14:50:32 +0000 Subject: mm/damon: remove unused target param of get_scheme_score() damon_target is not used by get_scheme_score operations, neither with virtual nor with physical addresses. Link: https://lkml.kernel.org/r/20260213145032.1740407-1-gutierrez.asier@huawei-partners.com Signed-off-by: Asier Gutierrez Reviewed-by: SeongJae Park Cc: Kefeng Wang Cc: Quanmin Yan Cc: ze zuo Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 +-- mm/damon/core.c | 10 +++++----- mm/damon/paddr.c | 3 +-- mm/damon/vaddr.c | 3 +-- 4 files changed, 8 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index be3d198043ff..60e6da3012fa 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -647,8 +647,7 @@ struct damon_operations { void (*prepare_access_checks)(struct damon_ctx *context); unsigned int (*check_accesses)(struct damon_ctx *context); int (*get_scheme_score)(struct damon_ctx *context, - struct damon_target *t, struct damon_region *r, - struct damos *scheme); + struct damon_region *r, struct damos *scheme); unsigned long (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme, unsigned long *sz_filter_passed); diff --git a/mm/damon/core.c b/mm/damon/core.c index 3e1890d64d06..0e5ada441b05 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1689,15 +1689,15 @@ static bool __damos_valid_target(struct damon_region *r, struct damos *s) r->age <= 
s->pattern.max_age_region; } -static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, - struct damon_region *r, struct damos *s) +static bool damos_valid_target(struct damon_ctx *c, struct damon_region *r, + struct damos *s) { bool ret = __damos_valid_target(r, s); if (!ret || !s->quota.esz || !c->ops.get_scheme_score) return ret; - return c->ops.get_scheme_score(c, t, r, s) >= s->quota.min_score; + return c->ops.get_scheme_score(c, r, s) >= s->quota.min_score; } /* @@ -2021,7 +2021,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, s->max_nr_snapshots <= s->stat.nr_snapshots) continue; - if (damos_valid_target(c, t, r, s)) + if (damos_valid_target(c, r, s)) damos_apply_scheme(c, t, r, s); if (damon_is_last_region(r, t)) @@ -2319,7 +2319,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) damon_for_each_region(r, t) { if (!__damos_valid_target(r, s)) continue; - score = c->ops.get_scheme_score(c, t, r, s); + score = c->ops.get_scheme_score(c, r, s); c->regions_score_histogram[score] += damon_sz_region(r); if (score > max_score) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 9bfe48826840..5cdcc5037cbc 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -343,8 +343,7 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, } static int damon_pa_scheme_score(struct damon_ctx *context, - struct damon_target *t, struct damon_region *r, - struct damos *scheme) + struct damon_region *r, struct damos *scheme) { switch (scheme->action) { case DAMOS_PAGEOUT: diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 729b7ffd3565..4d6d8251d419 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -985,8 +985,7 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, } static int damon_va_scheme_score(struct damon_ctx *context, - struct damon_target *t, struct damon_region *r, - struct damos *scheme) + struct damon_region *r, struct damos *scheme) { switch (scheme->action) { -- cgit v1.2.3 
From 5ad41a38c36474ff59545cb514801d90719555de Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Fri, 13 Feb 2026 15:18:22 +0800 Subject: mm: zswap: add per-memcg stat for incompressible pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: zswap: add per-memcg stat for incompressible pages", v3. In containerized environments, knowing which cgroup is contributing incompressible pages to zswap is essential for effective resource management. This series adds a new per-memcg stat 'zswap_incomp' to track incompressible pages, along with a selftest. This patch (of 2): The global zswap_stored_incompressible_pages counter was added in commit dca4437a5861 ("mm/zswap: store Acked-by: Nhat Pham Acked-by: Shakeel Butt Reviewed-by: Yosry Ahmed Reviewed-by: SeongJae Park Cc: Johannes Weiner Cc: Chengming Zhou Cc: Jonathan Corbet Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Shuah Khan Cc: Tejun Heo Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 5 +++++ include/linux/memcontrol.h | 1 + mm/memcontrol.c | 6 ++++++ 3 files changed, 12 insertions(+) (limited to 'include') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 91beaa6798ce..8ad0b2781317 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1734,6 +1734,11 @@ The following nested keys are defined. zswpwb Number of pages written from zswap to swap. + zswap_incomp + Number of incompressible pages currently stored in zswap + without compression. These pages could not be compressed to + a size smaller than PAGE_SIZE, so they are stored as-is. + thp_fault_alloc (npn) Number of transparent hugepages which were allocated to satisfy a page fault. 
This counter is not present when CONFIG_TRANSPARENT_HUGEPAGE diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 70b685a85bf4..5695776f32c8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -39,6 +39,7 @@ enum memcg_stat_item { MEMCG_KMEM, MEMCG_ZSWAP_B, MEMCG_ZSWAPPED, + MEMCG_ZSWAP_INCOMP, MEMCG_NR_STAT, }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 823ac6a05bf3..75df24ffdf25 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -356,6 +356,7 @@ static const unsigned int memcg_stat_items[] = { MEMCG_KMEM, MEMCG_ZSWAP_B, MEMCG_ZSWAPPED, + MEMCG_ZSWAP_INCOMP, }; #define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items) @@ -1368,6 +1369,7 @@ static const struct memory_stat memory_stats[] = { #ifdef CONFIG_ZSWAP { "zswap", MEMCG_ZSWAP_B }, { "zswapped", MEMCG_ZSWAPPED }, + { "zswap_incomp", MEMCG_ZSWAP_INCOMP }, #endif { "file_mapped", NR_FILE_MAPPED }, { "file_dirty", NR_FILE_DIRTY }, @@ -5520,6 +5522,8 @@ void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size) memcg = obj_cgroup_memcg(objcg); mod_memcg_state(memcg, MEMCG_ZSWAP_B, size); mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1); + if (size == PAGE_SIZE) + mod_memcg_state(memcg, MEMCG_ZSWAP_INCOMP, 1); rcu_read_unlock(); } @@ -5543,6 +5547,8 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) memcg = obj_cgroup_memcg(objcg); mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size); mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1); + if (size == PAGE_SIZE) + mod_memcg_state(memcg, MEMCG_ZSWAP_INCOMP, -1); rcu_read_unlock(); } -- cgit v1.2.3 From c5c48345135ff04e039377020df23294d59aa59a Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Wed, 11 Feb 2026 16:54:47 -0500 Subject: mm: name the anonymous MMOP enum as enum mmop Give the MMOP enum (MMOP_OFFLINE, MMOP_ONLINE, etc) a proper type name so the compiler can help catch invalid values being assigned to variables of this type. 
Leave the existing functions returning int alone to allow for value-or-error pattern to remain unchanged without churn. mhp_default_online_type is left as int because it uses the -1 sentinel value to signal it hasn't been initialized yet. Keep the uint8_t buffer in offline_and_remove_memory() as-is for space efficiency, with an explicit cast when we consume the value. Move the enum definition before the CONFIG_MEMORY_HOTPLUG guard so it is unconditionally available for struct memory_block in memory.h. No functional change. Link: https://lore.kernel.org/linux-mm/3424eba7-523b-4351-abd0-3a888a3e5e61@kernel.org/ Link: https://lkml.kernel.org/r/20260211215447.2194189-1-gourry@gourry.net Signed-off-by: Gregory Price Suggested-by: Jonathan Cameron Suggested-by: "David Hildenbrand (arm)" Reviewed-by: Ben Cheatham Acked-by: David Hildenbrand (Arm) Reviewed-by: Dave Jiang Reviewed-by: Davidlohr Bueso Reviewed-by: Jonathan Cameron Cc: Danilo Krummrich Cc: Greg Kroah-Hartman Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/base/memory.c | 2 +- include/linux/memory.h | 3 ++- include/linux/memory_hotplug.h | 16 ++++++++-------- mm/memory_hotplug.c | 10 +++++----- 4 files changed, 16 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index a3091924918b..5380050b16b7 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -452,7 +452,7 @@ static ssize_t phys_device_show(struct device *dev, static int print_allowed_zone(char *buf, int len, int nid, struct memory_group *group, unsigned long start_pfn, unsigned long nr_pages, - int online_type, struct zone *default_zone) + enum mmop online_type, struct zone *default_zone) { struct zone *zone; diff --git a/include/linux/memory.h b/include/linux/memory.h index faeaa921e55b..5bb5599c6b2b 100644 --- a/include/linux/memory.h +++ 
b/include/linux/memory.h @@ -19,6 +19,7 @@ #include #include #include +#include #define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS) @@ -77,7 +78,7 @@ enum memory_block_state { struct memory_block { unsigned long start_section_nr; enum memory_block_state state; /* serialized by the dev->lock */ - int online_type; /* for passing data to online routine */ + enum mmop online_type; /* for passing data to online routine */ int nid; /* NID for this memory block */ /* * The single zone of this memory block if all PFNs of this memory block diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index f2f16cdd73ee..e77ef3d7ff73 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -16,11 +16,8 @@ struct resource; struct vmem_altmap; struct dev_pagemap; -#ifdef CONFIG_MEMORY_HOTPLUG -struct page *pfn_to_online_page(unsigned long pfn); - /* Types for control the zone type of onlined and offlined memory */ -enum { +enum mmop { /* Offline the memory. */ MMOP_OFFLINE = 0, /* Online the memory. Zone depends, see default_zone_for_pfn(). */ @@ -31,6 +28,9 @@ enum { MMOP_ONLINE_MOVABLE, }; +#ifdef CONFIG_MEMORY_HOTPLUG +struct page *pfn_to_online_page(unsigned long pfn); + /* Flags for add_memory() and friends to specify memory hotplug details. */ typedef int __bitwise mhp_t; @@ -286,8 +286,8 @@ static inline void __remove_memory(u64 start, u64 size) {} #ifdef CONFIG_MEMORY_HOTPLUG /* Default online_type (MMOP_*) when new memory blocks are added. 
*/ -extern int mhp_get_default_online_type(void); -extern void mhp_set_default_online_type(int online_type); +extern enum mmop mhp_get_default_online_type(void); +extern void mhp_set_default_online_type(enum mmop online_type); extern void __ref free_area_init_core_hotplug(struct pglist_data *pgdat); extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); @@ -310,8 +310,8 @@ extern void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); -extern struct zone *zone_for_pfn_range(int online_type, int nid, - struct memory_group *group, unsigned long start_pfn, +extern struct zone *zone_for_pfn_range(enum mmop online_type, + int nid, struct memory_group *group, unsigned long start_pfn, unsigned long nr_pages); extern int arch_create_linear_mapping(int nid, u64 start, u64 size, struct mhp_params *params); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index bc805029da51..a602310bdf33 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -221,7 +221,7 @@ void put_online_mems(void) bool movable_node_enabled = false; static int mhp_default_online_type = -1; -int mhp_get_default_online_type(void) +enum mmop mhp_get_default_online_type(void) { if (mhp_default_online_type >= 0) return mhp_default_online_type; @@ -240,7 +240,7 @@ int mhp_get_default_online_type(void) return mhp_default_online_type; } -void mhp_set_default_online_type(int online_type) +void mhp_set_default_online_type(enum mmop online_type) { mhp_default_online_type = online_type; } @@ -1046,7 +1046,7 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn return movable_node_enabled ? 
movable_zone : kernel_zone; } -struct zone *zone_for_pfn_range(int online_type, int nid, +struct zone *zone_for_pfn_range(enum mmop online_type, int nid, struct memory_group *group, unsigned long start_pfn, unsigned long nr_pages) { @@ -2305,7 +2305,7 @@ EXPORT_SYMBOL_GPL(remove_memory); static int try_offline_memory_block(struct memory_block *mem, void *arg) { - uint8_t online_type = MMOP_ONLINE_KERNEL; + enum mmop online_type = MMOP_ONLINE_KERNEL; uint8_t **online_types = arg; struct page *page; int rc; @@ -2338,7 +2338,7 @@ static int try_reonline_memory_block(struct memory_block *mem, void *arg) int rc; if (**online_types != MMOP_OFFLINE) { - mem->online_type = **online_types; + mem->online_type = (enum mmop)**online_types; rc = device_online(&mem->dev); if (rc < 0) pr_warn("%s: Failed to re-online memory: %d", -- cgit v1.2.3 From 652d12bc74a075f345f228f8945e05517a38874d Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 11 Feb 2026 12:31:38 +0200 Subject: mm: don't special case !MMU for is_zero_pfn() and my_zero_pfn() Patch series "arch, mm: consolidate empty_zero_page", v3. These patches cleanup handling of ZERO_PAGE() and zero_pfn. This patch (of 4): nommu architectures have empty_zero_page and define ZERO_PAGE() and although they don't really use it to populate page tables, there is no reason to hardwire !MMU implementation of is_zero_pfn() and my_zero_pfn() to 0. Drop #ifdef CONFIG_MMU around implementations of is_zero_pfn() and my_zero_pfn() and remove !MMU version. While on it, make zero_pfn __ro_after_init. Link: https://lkml.kernel.org/r/20260211103141.3215197-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20260211103141.3215197-2-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand (Arm) Acked-by: Liam R. Howlett Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Cc: Christophe Leroy (CS GROUP) Cc: Dave Hansen Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 14 +------------- mm/memory.c | 13 ------------- mm/mm_init.c | 10 ++++++++++ 3 files changed, 11 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index a50df42a893f..5e772599d9a5 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1917,7 +1917,6 @@ static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot) pfnmap_setup_cachemode(pfn, PAGE_SIZE, prot); } -#ifdef CONFIG_MMU #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { @@ -1940,18 +1939,7 @@ static inline unsigned long my_zero_pfn(unsigned long addr) extern unsigned long zero_pfn; return zero_pfn; } -#endif -#else -static inline int is_zero_pfn(unsigned long pfn) -{ - return 0; -} - -static inline unsigned long my_zero_pfn(unsigned long addr) -{ - return 0; -} -#endif /* CONFIG_MMU */ +#endif /* __HAVE_COLOR_ZERO_PAGE */ #ifdef CONFIG_MMU diff --git a/mm/memory.c b/mm/memory.c index 7084c426f933..6b504fc5e815 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -162,21 +162,8 @@ static int __init disable_randmaps(char *s) } __setup("norandmaps", disable_randmaps); -unsigned long zero_pfn __read_mostly; -EXPORT_SYMBOL(zero_pfn); - unsigned long highest_memmap_pfn __read_mostly; -/* - * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() - */ -static int __init init_zero_pfn(void) -{ - zero_pfn = 
page_to_pfn(ZERO_PAGE(0)); - return 0; -} -early_initcall(init_zero_pfn); - void mm_trace_rss_stat(struct mm_struct *mm, int member) { trace_rss_stat(mm, member); diff --git a/mm/mm_init.c b/mm/mm_init.c index df34797691bd..f3755a66b9d0 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -53,6 +53,9 @@ EXPORT_SYMBOL(mem_map); void *high_memory; EXPORT_SYMBOL(high_memory); +unsigned long zero_pfn __ro_after_init; +EXPORT_SYMBOL(zero_pfn); + #ifdef CONFIG_DEBUG_MEMORY_INIT int __meminitdata mminit_loglevel; @@ -2672,6 +2675,13 @@ static void __init mem_init_print_info(void) ); } +static int __init init_zero_pfn(void) +{ + zero_pfn = page_to_pfn(ZERO_PAGE(0)); + return 0; +} +early_initcall(init_zero_pfn); + void __init __weak arch_mm_preinit(void) { } -- cgit v1.2.3 From 9a1d0c738b45ea8da4e6897099c708e89f43daad Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 11 Feb 2026 12:31:39 +0200 Subject: mm: rename my_zero_pfn() to zero_pfn() my_zero_pfn() is a silly name. Rename zero_pfn variable to zero_page_pfn and my_zero_pfn() function to zero_pfn(). While on it, move extern declarations of zero_page_pfn outside the functions that use it and add a comment about what ZERO_PAGE is. Link: https://lkml.kernel.org/r/20260211103141.3215197-3-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand (Arm) Acked-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Christophe Leroy (CS GROUP) Cc: Dave Hansen Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/kvm/mmu/spte.h | 2 +- fs/dax.c | 2 +- fs/proc/vmcore.c | 2 +- include/linux/pgtable.h | 28 ++++++++++++++++++++-------- mm/huge_memory.c | 2 +- mm/memory.c | 2 +- mm/migrate.c | 2 +- mm/mm_init.c | 10 +++++----- mm/userfaultfd.c | 4 ++-- 9 files changed, 33 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 91ce29fd6f1b..8c0ffa2cded6 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -248,7 +248,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; static inline hpa_t kvm_mmu_get_dummy_root(void) { - return my_zero_pfn(0) << PAGE_SHIFT; + return zero_pfn(0) << PAGE_SHIFT; } static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page) diff --git a/fs/dax.c b/fs/dax.c index 289e6254aa30..b78cff9c91b3 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1360,7 +1360,7 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf, { struct inode *inode = iter->inode; unsigned long vaddr = vmf->address; - unsigned long pfn = my_zero_pfn(vaddr); + unsigned long pfn = zero_pfn(vaddr); vm_fault_t ret; *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index f188bd900eb2..44d15436439f 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -525,7 +525,7 @@ static int remap_oldmem_pfn_checked(struct vm_area_struct *vma, { unsigned long map_size; unsigned long pos_start, pos_end, pos; - unsigned long zeropage_pfn = 
my_zero_pfn(0); + unsigned long zeropage_pfn = zero_pfn(0); size_t len = 0; pos_start = pfn; diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5e772599d9a5..c3a56f6b1ea5 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1917,27 +1917,39 @@ static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot) pfnmap_setup_cachemode(pfn, PAGE_SIZE, prot); } +/* + * ZERO_PAGE() is global shared page(s) that is always zero. It is used for + * zero-mapped memory areas, CoW etc. + * + * On architectures that __HAVE_COLOR_ZERO_PAGE there are several such pages + * for different ranges in the virtual address space. + * + * zero_page_pfn identifies the first (or the only) pfn for these pages. + */ #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { - extern unsigned long zero_pfn; - unsigned long offset_from_zero_pfn = pfn - zero_pfn; + extern unsigned long zero_page_pfn; + unsigned long offset_from_zero_pfn = pfn - zero_page_pfn; + return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); } -#define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) +#define zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) #else static inline int is_zero_pfn(unsigned long pfn) { - extern unsigned long zero_pfn; - return pfn == zero_pfn; + extern unsigned long zero_page_pfn; + + return pfn == zero_page_pfn; } -static inline unsigned long my_zero_pfn(unsigned long addr) +static inline unsigned long zero_pfn(unsigned long addr) { - extern unsigned long zero_pfn; - return zero_pfn; + extern unsigned long zero_page_pfn; + + return zero_page_pfn; } #endif /* __HAVE_COLOR_ZERO_PAGE */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b298cba853ab..a132fb98ed5d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2972,7 +2972,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { pte_t entry; - entry = pfn_pte(my_zero_pfn(addr), 
vma->vm_page_prot); + entry = pfn_pte(zero_pfn(addr), vma->vm_page_prot); entry = pte_mkspecial(entry); if (pmd_uffd_wp(old_pmd)) entry = pte_mkuffd_wp(entry); diff --git a/mm/memory.c b/mm/memory.c index 6b504fc5e815..af26a697562b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5224,7 +5224,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* Use the zero-page for reads */ if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { - entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), + entry = pte_mkspecial(pfn_pte(zero_pfn(vmf->address), vma->vm_page_prot)); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); diff --git a/mm/migrate.c b/mm/migrate.c index 2c3d489ecf51..6cc654858da6 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -321,7 +321,7 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw, if (!pages_identical(page, ZERO_PAGE(0))) return false; - newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address), + newpte = pte_mkspecial(pfn_pte(zero_pfn(pvmw->address), pvmw->vma->vm_page_prot)); if (pte_swp_soft_dirty(old_pte)) diff --git a/mm/mm_init.c b/mm/mm_init.c index f3755a66b9d0..ab6578516dd6 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -53,8 +53,8 @@ EXPORT_SYMBOL(mem_map); void *high_memory; EXPORT_SYMBOL(high_memory); -unsigned long zero_pfn __ro_after_init; -EXPORT_SYMBOL(zero_pfn); +unsigned long zero_page_pfn __ro_after_init; +EXPORT_SYMBOL(zero_page_pfn); #ifdef CONFIG_DEBUG_MEMORY_INIT int __meminitdata mminit_loglevel; @@ -2675,12 +2675,12 @@ static void __init mem_init_print_info(void) ); } -static int __init init_zero_pfn(void) +static int __init init_zero_page_pfn(void) { - zero_pfn = page_to_pfn(ZERO_PAGE(0)); + zero_page_pfn = page_to_pfn(ZERO_PAGE(0)); return 0; } -early_initcall(init_zero_pfn); +early_initcall(init_zero_page_pfn); void __init __weak arch_mm_preinit(void) { diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 927086bb4a3c..e19872e51878 
100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -357,7 +357,7 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, if (mm_forbids_zeropage(dst_vma->vm_mm)) return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr); - _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), + _dst_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr), dst_vma->vm_page_prot)); ret = -EAGAIN; dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl); @@ -1229,7 +1229,7 @@ static int move_zeropage_pte(struct mm_struct *mm, return -EAGAIN; } - zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), + zero_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr), dst_vma->vm_page_prot)); ptep_clear_flush(src_vma, src_addr, src_pte); set_pte_at(mm, dst_addr, dst_pte, zero_pte); -- cgit v1.2.3 From 6215d9f4470fbb48245ffdfade821685e2728c65 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 11 Feb 2026 12:31:40 +0200 Subject: arch, mm: consolidate empty_zero_page Reduce 22 declarations of empty_zero_page to 3 and 23 declarations of ZERO_PAGE() to 4. Every architecture defines empty_zero_page one way or another, but for most of them it is always a page-aligned page in BSS and most definitions of ZERO_PAGE do virt_to_page(empty_zero_page). Move the Linus-vetted x86 definition of empty_zero_page and ZERO_PAGE() to the core MM and drop these definitions in architectures that do not implement colored zero pages (that is, all except MIPS and s390). ZERO_PAGE() remains a macro because turning it into a wrapper for a static inline causes severe pain in header dependencies. For the most part the change is mechanical, with these being noteworthy: * alpha: aliased empty_zero_page with ZERO_PGE that was also used for boot parameters. Switching to a generic empty_zero_page removes the aliasing and keeps ZERO_PGE for boot parameters only. * arm64: uses __pa_symbol() in ZERO_PAGE() so its definition of ZERO_PAGE() is kept intact. 
* m68k/parisc/um: allocated empty_zero_page from memblock, although they do not support zero page coloring and having it in BSS will work fine. * sparc64 can have empty_zero_page in BSS rather than allocating it, but it can't use virt_to_page() for BSS. Keep its definition of ZERO_PAGE() but instead of allocating it, make mem_map_zero point to empty_zero_page. * sh: used empty_zero_page for boot parameters very early in boot. Rename the parameters page to boot_params_page and let sh use the generic empty_zero_page. * hexagon: had an amusing comment about empty_zero_page /* A handy thing to have if one has the RAM. Declared in head.S */ that unfortunately had to go :) Link: https://lkml.kernel.org/r/20260211103141.3215197-4-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Helge Deller [parisc] Tested-by: Helge Deller [parisc] Reviewed-by: Christophe Leroy (CS GROUP) Acked-by: Dave Hansen Acked-by: Catalin Marinas Acked-by: Magnus Lindholm [alpha] Acked-by: Dinh Nguyen [nios2] Acked-by: Andreas Larsson [sparc] Acked-by: David Hildenbrand (Arm) Acked-by: Liam R. Howlett Cc: "Borislav Petkov (AMD)" Cc: David S. 
Miller Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/alpha/include/asm/pgtable.h | 6 ------ arch/arc/include/asm/pgtable.h | 3 --- arch/arc/mm/init.c | 2 -- arch/arm/include/asm/pgtable.h | 9 --------- arch/arm/mm/mmu.c | 7 ------- arch/arm/mm/nommu.c | 7 ------- arch/arm64/include/asm/pgtable.h | 1 - arch/arm64/mm/mmu.c | 7 ------- arch/csky/include/asm/pgtable.h | 3 --- arch/csky/mm/init.c | 3 --- arch/hexagon/include/asm/pgtable.h | 6 ------ arch/hexagon/kernel/head.S | 5 ----- arch/hexagon/kernel/hexagon_ksyms.c | 1 - arch/loongarch/include/asm/pgtable.h | 9 --------- arch/loongarch/mm/init.c | 3 --- arch/m68k/include/asm/pgtable_mm.h | 9 --------- arch/m68k/include/asm/pgtable_no.h | 7 ------- arch/m68k/mm/init.c | 9 --------- arch/m68k/mm/mcfmmu.c | 2 -- arch/m68k/mm/motorola.c | 6 ------ arch/m68k/mm/sun3mmu.c | 2 -- arch/microblaze/include/asm/pgtable.h | 10 ---------- arch/microblaze/kernel/head.S | 4 ---- arch/microblaze/kernel/microblaze_ksyms.c | 2 -- arch/nios2/include/asm/pgtable.h | 7 ------- arch/nios2/kernel/head.S | 10 ---------- arch/nios2/kernel/nios2_ksyms.c | 1 - arch/openrisc/include/asm/pgtable.h | 4 ---- arch/openrisc/kernel/head.S | 3 --- arch/openrisc/kernel/or32_ksyms.c | 1 - arch/openrisc/mm/init.c | 3 --- arch/parisc/include/asm/pgtable.h | 11 ----------- arch/parisc/mm/init.c | 6 ------ arch/powerpc/include/asm/pgtable.h | 6 ------ arch/powerpc/mm/mem.c | 3 --- arch/riscv/include/asm/pgtable.h | 7 ------- arch/riscv/mm/init.c | 4 ---- arch/sh/include/asm/pgtable.h | 8 -------- arch/sh/include/asm/setup.h | 3 ++- arch/sh/kernel/head_32.S | 4 ++-- 
arch/sh/kernel/sh_ksyms_32.c | 1 - arch/sh/mm/init.c | 1 - arch/sparc/include/asm/pgtable_32.h | 8 -------- arch/sparc/include/asm/setup.h | 2 -- arch/sparc/kernel/head_32.S | 7 ------- arch/sparc/mm/init_32.c | 4 ---- arch/sparc/mm/init_64.c | 11 ++++------- arch/um/include/asm/pgtable.h | 9 --------- arch/um/include/shared/kern_util.h | 1 - arch/um/kernel/mem.c | 16 ---------------- arch/um/kernel/um_arch.c | 1 - arch/x86/include/asm/pgtable.h | 8 -------- arch/x86/kernel/head_32.S | 4 ---- arch/x86/kernel/head_64.S | 7 ------- arch/xtensa/include/asm/pgtable.h | 4 ---- arch/xtensa/kernel/head.S | 3 --- arch/xtensa/kernel/xtensa_ksyms.c | 2 -- include/linux/pgtable.h | 10 ++++++++++ mm/mm_init.c | 5 +++++ 59 files changed, 23 insertions(+), 285 deletions(-) (limited to 'include') diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index e9368c54be45..268ddde33617 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -126,12 +126,6 @@ struct vm_area_struct; */ #define pgprot_noncached(prot) (prot) -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. 
- */ -#define ZERO_PAGE(vaddr) (virt_to_page(ZERO_PGE)) - /* * On certain platforms whose physical address space can overlap KSEG, * namely EV6 and above, we must re-twiddle the physaddr to restore the diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index bd580e2b62d7..0fdaea81b5fa 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -21,9 +21,6 @@ #ifndef __ASSEMBLER__ -extern char empty_zero_page[PAGE_SIZE]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - extern pgd_t swapper_pg_dir[] __aligned(PAGE_SIZE); /* to cope with aliasing VIPT cache */ diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index a5e92f46e5d1..d6b5c27a0098 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -19,8 +19,6 @@ #include pgd_t swapper_pg_dir[PTRS_PER_PGD] __aligned(PAGE_SIZE); -char empty_zero_page[PAGE_SIZE] __aligned(PAGE_SIZE); -EXPORT_SYMBOL(empty_zero_page); static const unsigned long low_mem_start = CONFIG_LINUX_RAM_BASE; static unsigned long low_mem_sz; diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index 6fa9acd6a7f5..982795cf4563 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -10,15 +10,6 @@ #include #include -#ifndef __ASSEMBLY__ -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. - */ -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) -#endif - #include #ifndef CONFIG_MMU diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 518def8314e7..23b87b5ef7f1 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -41,13 +41,6 @@ extern unsigned long __atags_pointer; -/* - * empty_zero_page is a special page that is used for - * zero-initialized data and COW. 
- */ -unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; -EXPORT_SYMBOL(empty_zero_page); - /* * The pmd table for the upper-most set of pages. */ diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index 7e42d8accec6..040ea43cce32 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c @@ -27,13 +27,6 @@ unsigned long vectors_base; -/* - * empty_zero_page is a special page that is used for - * zero-initialized data and COW. - */ -unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; -EXPORT_SYMBOL(empty_zero_page); - #ifdef CONFIG_ARM_MPU struct mpu_rgn_info mpu_rgn_info; #endif diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index b3e58735c49b..769570e43c18 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -110,7 +110,6 @@ static inline void arch_leave_lazy_mmu_mode(void) * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. */ -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; #define ZERO_PAGE(vaddr) phys_to_page(__pa_symbol(empty_zero_page)) #define pte_ERROR(e) \ diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a12ea8776c32..ec932f6ccddc 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -64,13 +64,6 @@ static bool rodata_is_rw __ro_after_init = true; */ long __section(".mmuoff.data.write") __early_cpu_boot_status; -/* - * Empty_zero_page is a special page that is used for zero-initialized data - * and COW. 
- */ -unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; -EXPORT_SYMBOL(empty_zero_page); - static DEFINE_SPINLOCK(swapper_pgdir_lock); static DEFINE_MUTEX(fixmap_lock); diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index d606afbabce1..bafcd5823531 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -76,9 +76,6 @@ #define MAX_SWAPFILES_CHECK() \ BUILD_BUG_ON(MAX_SWAPFILES_SHIFT != 5) -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - extern void load_pgd(unsigned long pg_dir); extern pte_t invalid_pte_table[PTRS_PER_PTE]; diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c index 573da66b2543..fa16015ea1c0 100644 --- a/arch/csky/mm/init.c +++ b/arch/csky/mm/init.c @@ -38,9 +38,6 @@ pte_t invalid_pte_table[PTRS_PER_PTE] __page_aligned_bss; pte_t kernel_pte_tables[PTRS_KERN_TABLE] __page_aligned_bss; EXPORT_SYMBOL(invalid_pte_table); -unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] - __page_aligned_bss; -EXPORT_SYMBOL(empty_zero_page); void free_initmem(void) { diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index fbf24d1d1ca6..27b269e2870d 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -14,9 +14,6 @@ #include #include -/* A handy thing to have if one has the RAM. Declared in head.S */ -extern unsigned long empty_zero_page; - /* * The PTE model described here is that of the Hexagon Virtual Machine, * which autonomously walks 2-level page tables. At a lower level, we @@ -348,9 +345,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) return (unsigned long)__va(pmd_val(pmd) & PAGE_MASK); } -/* ZERO_PAGE - returns the globally shared zero page */ -#define ZERO_PAGE(vaddr) (virt_to_page(&empty_zero_page)) - /* * Encode/decode swap entries and swap PTEs. 
Swap PTEs are all PTEs that * are !pte_none() && !pte_present(). diff --git a/arch/hexagon/kernel/head.S b/arch/hexagon/kernel/head.S index 0b016308cc79..908ffece9132 100644 --- a/arch/hexagon/kernel/head.S +++ b/arch/hexagon/kernel/head.S @@ -216,8 +216,3 @@ __head_s_vaddr_target: .p2align PAGE_SHIFT ENTRY(external_cmdline_buffer) .fill _PAGE_SIZE,1,0 - -.data -.p2align PAGE_SHIFT -ENTRY(empty_zero_page) - .fill _PAGE_SIZE,1,0 diff --git a/arch/hexagon/kernel/hexagon_ksyms.c b/arch/hexagon/kernel/hexagon_ksyms.c index 36a80e31d187..81bc6f81e200 100644 --- a/arch/hexagon/kernel/hexagon_ksyms.c +++ b/arch/hexagon/kernel/hexagon_ksyms.c @@ -17,7 +17,6 @@ EXPORT_SYMBOL(raw_copy_to_user); EXPORT_SYMBOL(__vmgetie); EXPORT_SYMBOL(__vmsetie); EXPORT_SYMBOL(__vmyield); -EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memset); diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index c33b3bcb733e..a244de27a03e 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -74,15 +74,6 @@ struct mm_struct; struct vm_area_struct; -/* - * ZERO_PAGE is a global shared page that is always zero; used - * for zero-mapped memory areas etc.. 
- */ - -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; - -#define ZERO_PAGE(vaddr) virt_to_page(empty_zero_page) - #ifdef CONFIG_32BIT #define VMALLOC_START (vm_map_base + PCI_IOSIZE + (2 * PAGE_SIZE)) diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index c331bf69d2ec..00f3822b6e47 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -36,9 +36,6 @@ #include #include -unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; -EXPORT_SYMBOL(empty_zero_page); - void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h index bba64a9c49ac..7501ff030c63 100644 --- a/arch/m68k/include/asm/pgtable_mm.h +++ b/arch/m68k/include/asm/pgtable_mm.h @@ -110,15 +110,6 @@ extern unsigned long m68k_vmalloc_end; #define VMALLOC_END KMAP_START #endif -/* zero page used for uninitialized stuff */ -extern void *empty_zero_page; - -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. - */ -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - extern void kernel_set_cachemode(void *addr, unsigned long size, int cmode); /* diff --git a/arch/m68k/include/asm/pgtable_no.h b/arch/m68k/include/asm/pgtable_no.h index 1a86c15b9008..11751807a3f3 100644 --- a/arch/m68k/include/asm/pgtable_no.h +++ b/arch/m68k/include/asm/pgtable_no.h @@ -30,13 +30,6 @@ #define swapper_pg_dir ((pgd_t *) 0) -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. - */ -extern void *empty_zero_page; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - /* * All 32bit addresses are effectively valid for vmalloc... * Sort of meaningless for non-VM targets. 
diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 53b71f786c27..3b88c0dd1616 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -33,13 +33,6 @@ #include #include -/* - * ZERO_PAGE is a special page that is used for zero-initialized - * data and COW. - */ -void *empty_zero_page; -EXPORT_SYMBOL(empty_zero_page); - void __init arch_zone_limits_init(unsigned long *max_zone_pfns) { max_zone_pfns[ZONE_DMA] = PFN_DOWN(memblock_end_of_DRAM()); @@ -71,8 +64,6 @@ void __init paging_init(void) unsigned long end_mem = memory_end & PAGE_MASK; high_memory = (void *) end_mem; - - empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); } #endif /* CONFIG_MMU */ diff --git a/arch/m68k/mm/mcfmmu.c b/arch/m68k/mm/mcfmmu.c index 3418fd864237..4924f2ff8ef8 100644 --- a/arch/m68k/mm/mcfmmu.c +++ b/arch/m68k/mm/mcfmmu.c @@ -41,8 +41,6 @@ void __init paging_init(void) unsigned long next_pgtable; int i; - empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); - pg_dir = swapper_pg_dir; memset(swapper_pg_dir, 0, sizeof(swapper_pg_dir)); diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index 127a3fa69f4c..b30aa69a73a6 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -498,12 +498,6 @@ void __init paging_init(void) early_memtest(min_addr, max_addr); - /* - * initialize the bad page table and bad page to point - * to a couple of allocated pages - */ - empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); - /* * Set up SFC/DFC registers */ diff --git a/arch/m68k/mm/sun3mmu.c b/arch/m68k/mm/sun3mmu.c index c801677f7df8..f139cc15753a 100644 --- a/arch/m68k/mm/sun3mmu.c +++ b/arch/m68k/mm/sun3mmu.c @@ -43,8 +43,6 @@ void __init paging_init(void) unsigned long bootmem_end; unsigned long size; - empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); - address = PAGE_OFFSET; pg_dir = swapper_pg_dir; memset (swapper_pg_dir, 0, sizeof (swapper_pg_dir)); diff --git a/arch/microblaze/include/asm/pgtable.h 
b/arch/microblaze/include/asm/pgtable.h index 4eb76de6be4a..ea72291de553 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -207,16 +207,6 @@ extern pte_t *va_to_pte(unsigned long address); * Also, write permissions imply read permissions. */ -#ifndef __ASSEMBLER__ -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. - */ -extern unsigned long empty_zero_page[1024]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - -#endif /* __ASSEMBLER__ */ - #define pte_none(pte) ((pte_val(pte) & ~_PTE_NONE_MASK) == 0) #define pte_present(pte) (pte_val(pte) & _PAGE_PRESENT) #define pte_clear(mm, addr, ptep) \ diff --git a/arch/microblaze/kernel/head.S b/arch/microblaze/kernel/head.S index ec2fcb545e64..808019c3b7ac 100644 --- a/arch/microblaze/kernel/head.S +++ b/arch/microblaze/kernel/head.S @@ -39,10 +39,6 @@ #include .section .data -.global empty_zero_page -.align 12 -empty_zero_page: - .space PAGE_SIZE .global swapper_pg_dir swapper_pg_dir: .space PAGE_SIZE diff --git a/arch/microblaze/kernel/microblaze_ksyms.c b/arch/microblaze/kernel/microblaze_ksyms.c index a8553f54152b..ad7596d7ba07 100644 --- a/arch/microblaze/kernel/microblaze_ksyms.c +++ b/arch/microblaze/kernel/microblaze_ksyms.c @@ -33,8 +33,6 @@ EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memmove); #endif -EXPORT_SYMBOL(empty_zero_page); - EXPORT_SYMBOL(mbc); extern void __divsi3(void); diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index 844dce55569f..d389aa9ca57c 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -65,13 +65,6 @@ struct mm_struct; #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. 
- */ -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern pte_t invalid_pte_table[PAGE_SIZE/sizeof(pte_t)]; diff --git a/arch/nios2/kernel/head.S b/arch/nios2/kernel/head.S index 372ce4a33018..613212e1a63a 100644 --- a/arch/nios2/kernel/head.S +++ b/arch/nios2/kernel/head.S @@ -23,16 +23,6 @@ #include #include -/* - * ZERO_PAGE is a special page that is used for zero-initialized - * data and COW. - */ -.data -.global empty_zero_page -.align 12 -empty_zero_page: - .space PAGE_SIZE - /* * This global variable is used as an extension to the nios' * STATUS register to emulate a user/supervisor mode. diff --git a/arch/nios2/kernel/nios2_ksyms.c b/arch/nios2/kernel/nios2_ksyms.c index 54f7b23df1bf..c40aa39e8658 100644 --- a/arch/nios2/kernel/nios2_ksyms.c +++ b/arch/nios2/kernel/nios2_ksyms.c @@ -20,7 +20,6 @@ EXPORT_SYMBOL(memmove); /* memory management */ -EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(flush_icache_range); /* diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index b218050e2f6d..6b89996d0b62 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -179,10 +179,6 @@ extern void paging_init(void); __pgprot(_PAGE_ALL | _PAGE_SRE | _PAGE_SWE \ | _PAGE_SHARED | _PAGE_DIRTY | _PAGE_EXEC | _PAGE_CI) -/* zero page used for uninitialized stuff */ -extern unsigned long empty_zero_page[2048]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - #define pte_none(x) (!pte_val(x)) #define pte_present(x) (pte_val(x) & _PAGE_PRESENT) #define pte_clear(mm, addr, xp) do { pte_val(*(xp)) = 0; } while (0) diff --git a/arch/openrisc/kernel/head.S b/arch/openrisc/kernel/head.S index bd760066f1cd..45890393947d 100644 --- a/arch/openrisc/kernel/head.S +++ b/arch/openrisc/kernel/head.S @@ -1563,9 +1563,6 @@ _string_nl: */ .section .data,"aw" .align 8192 - .global 
empty_zero_page -empty_zero_page: - .space 8192 .global swapper_pg_dir swapper_pg_dir: diff --git a/arch/openrisc/kernel/or32_ksyms.c b/arch/openrisc/kernel/or32_ksyms.c index 212e5f85004c..84a937a64e2a 100644 --- a/arch/openrisc/kernel/or32_ksyms.c +++ b/arch/openrisc/kernel/or32_ksyms.c @@ -40,7 +40,6 @@ DECLARE_EXPORT(__ashldi3); DECLARE_EXPORT(__lshrdi3); DECLARE_EXPORT(__ucmpdi2); -EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(__copy_tofrom_user); EXPORT_SYMBOL(__clear_user); EXPORT_SYMBOL(memset); diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 78fb0734cdbc..89d8c6df8855 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -188,9 +188,6 @@ void __init mem_init(void) { BUG_ON(!mem_map); - /* clear the zero-page */ - memset((void *)empty_zero_page, 0, PAGE_SIZE); - printk("mem_init_done ...........................................\n"); mem_init_done = 1; return; diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 17afe7a59edf..f6fb99cb94d9 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -262,17 +262,6 @@ extern pgd_t swapper_pg_dir[]; /* declared in init_task.c */ extern pte_t pg0[]; -/* zero page used for uninitialized stuff */ - -extern unsigned long *empty_zero_page; - -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. 
- */ - -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - #define pte_none(x) (pte_val(x) == 0) #define pte_present(x) (pte_val(x) & _PAGE_PRESENT) #define pte_user(x) (pte_val(x) & _PAGE_USER) diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 6a39e031e5ff..be3380c9bcda 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -604,9 +604,6 @@ void __init mem_init(void) #endif } -unsigned long *empty_zero_page __ro_after_init; -EXPORT_SYMBOL(empty_zero_page); - /* * pagetable_init() sets up the page tables * @@ -639,9 +636,6 @@ static void __init pagetable_init(void) initrd_end - initrd_start, PAGE_KERNEL, 0); } #endif - - empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); - } static void __init gateway_init(void) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index dcd3a88caaf6..b27d94c06d0e 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -76,12 +76,6 @@ static inline const void *pmd_page_vaddr(pmd_t pmd) } #define pmd_page_vaddr pmd_page_vaddr #endif -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. 
- */ -extern unsigned long empty_zero_page[]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) extern pgd_t swapper_pg_dir[]; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index b7982d0243d4..648d0c5602ec 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -38,9 +38,6 @@ unsigned long long memory_limit __initdata; -unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; -EXPORT_SYMBOL(empty_zero_page); - pgprot_t __phys_mem_access_prot(unsigned long pfn, unsigned long size, pgprot_t vma_prot) { diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 08d1ca047104..ab4ce1cc9d9c 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -1284,13 +1284,6 @@ extern u64 satp_mode; void paging_init(void); void misc_mem_init(void); -/* - * ZERO_PAGE is a global shared page that is always zero, - * used for zero-mapped memory areas, etc. - */ -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - /* * Use set_p*_safe(), and elide TLB flushing, when confident that *no* * TLB flush will be required as a result of the "set". 
For example, use diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 811e03786c56..017bad735d47 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -69,10 +69,6 @@ unsigned long vmemmap_start_pfn __ro_after_init; EXPORT_SYMBOL(vmemmap_start_pfn); #endif -unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] - __page_aligned_bss; -EXPORT_SYMBOL(empty_zero_page); - extern char _start[]; void *_dtb_early_va __initdata; uintptr_t _dtb_early_pa __initdata; diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index 10fa8f2bb8d1..d5ce0950a323 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -20,14 +20,6 @@ #ifndef __ASSEMBLER__ #include #include - -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. - */ -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - #endif /* !__ASSEMBLER__ */ /* diff --git a/arch/sh/include/asm/setup.h b/arch/sh/include/asm/setup.h index 84bb23a771f3..63c9efc06348 100644 --- a/arch/sh/include/asm/setup.h +++ b/arch/sh/include/asm/setup.h @@ -7,7 +7,8 @@ /* * This is set up by the setup-routine at boot-time */ -#define PARAM ((unsigned char *)empty_zero_page) +extern unsigned char *boot_params_page; +#define PARAM boot_params_page #define MOUNT_ROOT_RDONLY (*(unsigned long *) (PARAM+0x000)) #define RAMDISK_FLAGS (*(unsigned long *) (PARAM+0x004)) diff --git a/arch/sh/kernel/head_32.S b/arch/sh/kernel/head_32.S index b603b7968b38..0b91bb85d40a 100644 --- a/arch/sh/kernel/head_32.S +++ b/arch/sh/kernel/head_32.S @@ -26,7 +26,7 @@ #endif .section .empty_zero_page, "aw" -ENTRY(empty_zero_page) +ENTRY(boot_params_page) .long 1 /* MOUNT_ROOT_RDONLY */ .long 0 /* RAMDISK_FLAGS */ .long 0x0200 /* ORIG_ROOT_DEV */ @@ -39,7 +39,7 @@ ENTRY(empty_zero_page) .long 0x53453f00 + 29 /* "SE?" 
= 29 bit */ #endif 1: - .skip PAGE_SIZE - empty_zero_page - 1b + .skip PAGE_SIZE - boot_params_page - 1b __HEAD diff --git a/arch/sh/kernel/sh_ksyms_32.c b/arch/sh/kernel/sh_ksyms_32.c index 5858936cb431..041191002e2e 100644 --- a/arch/sh/kernel/sh_ksyms_32.c +++ b/arch/sh/kernel/sh_ksyms_32.c @@ -20,7 +20,6 @@ EXPORT_SYMBOL(csum_partial); EXPORT_SYMBOL(csum_partial_copy_generic); EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(__clear_user); -EXPORT_SYMBOL(empty_zero_page); #ifdef CONFIG_FLATMEM /* need in pfn_valid macro */ EXPORT_SYMBOL(min_low_pfn); diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 464a3a63e2fa..4e40d5e96be9 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -332,7 +332,6 @@ void __init mem_init(void) cpu_cache_init(); /* clear the zero-page */ - memset(empty_zero_page, 0, PAGE_SIZE); __flush_wback_region(empty_zero_page, PAGE_SIZE); vsyscall_init(); diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index a9f802d1dd64..f89b1250661d 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -71,14 +71,6 @@ extern unsigned long ptr_in_current_pgd; extern unsigned long phys_base; extern unsigned long pfn_base; -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. - */ -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; - -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - /* * In general all page table modifications should use the V8 atomic * swap instruction. 
This insures the mmu and the cpu are in sync diff --git a/arch/sparc/include/asm/setup.h b/arch/sparc/include/asm/setup.h index 72205684e51e..21bed5514028 100644 --- a/arch/sparc/include/asm/setup.h +++ b/arch/sparc/include/asm/setup.h @@ -17,8 +17,6 @@ extern char reboot_command[]; */ extern unsigned char boot_cpu_id; -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; - extern int serial_console; static inline int con_is_present(void) { diff --git a/arch/sparc/kernel/head_32.S b/arch/sparc/kernel/head_32.S index 38345460d542..8c320fa25a67 100644 --- a/arch/sparc/kernel/head_32.S +++ b/arch/sparc/kernel/head_32.S @@ -57,13 +57,6 @@ sun4e_notsup: .align PAGE_SIZE -/* This was the only reasonable way I could think of to properly align - * these page-table data structures. - */ - .globl empty_zero_page -empty_zero_page: .skip PAGE_SIZE -EXPORT_SYMBOL(empty_zero_page) - .global root_flags .global ram_flags .global root_dev diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index fdc93dd12c3e..e0e66f91ceeb 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -246,10 +246,6 @@ void __init arch_mm_preinit(void) prom_halt(); } - - /* Saves us work later. 
*/ - memset((void *)empty_zero_page, 0, PAGE_SIZE); - i = last_valid_pfn >> ((20 - PAGE_SHIFT) + 5); i += 1; sparc_valid_addr_bitmap = (unsigned long *) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index f46394c46a76..748790998ff5 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2492,6 +2492,9 @@ static void __init register_page_bootmem_info(void) } void __init mem_init(void) { + phys_addr_t zero_page_pa = kern_base + + ((unsigned long)&empty_zero_page[0] - KERNBASE); + /* * Must be done after boot memory is put on freelist, because here we * might set fields in deferred struct pages that have not yet been @@ -2504,13 +2507,7 @@ void __init mem_init(void) * Set up the zero page, mark it reserved, so that page count * is not manipulated when freeing the page from user ptes. */ - mem_map_zero = alloc_pages(GFP_KERNEL|__GFP_ZERO, 0); - if (mem_map_zero == NULL) { - prom_printf("paging_init: Cannot alloc zero page.\n"); - prom_halt(); - } - mark_page_reserved(mem_map_zero); - + mem_map_zero = pfn_to_page(PHYS_PFN(zero_page_pa)); if (tlb_type == cheetah || tlb_type == cheetah_plus) cheetah_ecache_flush_init(); diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index 3b42b0f45bf6..19e0608fb649 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -34,9 +34,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; -/* zero page used for uninitialized stuff */ -extern unsigned long *empty_zero_page; - /* Just any arbitrary offset to the start of the vmalloc VM area: the * current 8MB value just means that there will be a 8MB "hole" after the * physical memory until the kernel virtual memory starts. That means that @@ -74,12 +71,6 @@ extern unsigned long *empty_zero_page; * get.. */ -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. 
- */ -#define ZERO_PAGE(vaddr) virt_to_page(empty_zero_page) - #define pte_clear(mm, addr, xp) pte_set_val(*(xp), (phys_t) 0, __pgprot(_PAGE_NEEDSYNC)) #define pmd_none(x) (!((unsigned long)pmd_val(x) & ~_PAGE_NEEDSYNC)) diff --git a/arch/um/include/shared/kern_util.h b/arch/um/include/shared/kern_util.h index 38321188c04c..9812efd14ec0 100644 --- a/arch/um/include/shared/kern_util.h +++ b/arch/um/include/shared/kern_util.h @@ -38,7 +38,6 @@ extern void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs extern void uml_pm_wake(void); extern int start_uml(void); -extern void paging_init(void); extern void uml_cleanup(void); extern void do_uml_exitcalls(void); diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 89c8c8b94a79..1eef0e42ef5d 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -44,10 +44,6 @@ __section(".kasan_init") __used = kasan_init; #endif -/* allocated in paging_init, zeroed in mem_init, and unchanged thereafter */ -unsigned long *empty_zero_page = NULL; -EXPORT_SYMBOL(empty_zero_page); - /* * Initialized during boot, and readonly for initializing page tables * afterwards @@ -65,9 +61,6 @@ void __init arch_mm_preinit(void) /* Safe to call after jump_label_init(). Enables KASAN. */ kasan_init_generic(); - /* clear the zero-page */ - memset(empty_zero_page, 0, PAGE_SIZE); - /* Map in the area just after the brk now that kmalloc is about * to be turned on. */ @@ -89,15 +82,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) max_zone_pfns[ZONE_NORMAL] = high_physmem >> PAGE_SHIFT; } -void __init paging_init(void) -{ - empty_zero_page = (unsigned long *) memblock_alloc_low(PAGE_SIZE, - PAGE_SIZE); - if (!empty_zero_page) - panic("%s: Failed to allocate %lu bytes align=%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); -} - /* * This can't do anything because nothing in the kernel image can be freed * since it's not in kernel physical memory. 
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index e2b24e1ecfa6..2141f5f1f5a2 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -413,7 +413,6 @@ void __init setup_arch(char **cmdline_p) uml_dtb_init(); read_initrd(); - paging_init(); strscpy(boot_command_line, command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; setup_hostinfo(host_info, sizeof host_info); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1662c5a8f445..54289f4587a4 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -47,14 +47,6 @@ void ptdump_walk_user_pgd_level_checkwx(void); #define debug_checkwx_user() do { } while (0) #endif -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. - */ -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] - __visible; -#define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page)) - extern spinlock_t pgd_lock; extern struct list_head pgd_list; diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 80ef5d386b03..5171cb746444 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -441,10 +441,6 @@ initial_pg_fixmap: swapper_pg_dir: .fill 1024,4,0 .fill PTI_USER_PGD_FILL,4,0 -.globl empty_zero_page -empty_zero_page: - .fill 4096,1,0 -EXPORT_SYMBOL(empty_zero_page) /* * This starts the data section. 
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 85d4a5094f6b..7ed5520dd52e 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -684,10 +684,3 @@ SYM_PIC_ALIAS(phys_base); EXPORT_SYMBOL(phys_base) #include "../xen/xen-head.S" - - __PAGE_ALIGNED_BSS -SYM_DATA_START_PAGE_ALIGNED(empty_zero_page) - .skip PAGE_SIZE -SYM_DATA_END(empty_zero_page) -EXPORT_SYMBOL(empty_zero_page) - diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 50a136213b2b..61f07d981a94 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -209,10 +209,6 @@ #define pgd_ERROR(e) \ printk("%s:%d: bad pgd entry %08lx.\n", __FILE__, __LINE__, pgd_val(e)) -extern unsigned long empty_zero_page[1024]; - -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - #ifdef CONFIG_MMU extern pgd_t swapper_pg_dir[PAGE_SIZE/sizeof(pgd_t)]; extern void paging_init(void); diff --git a/arch/xtensa/kernel/head.S b/arch/xtensa/kernel/head.S index 8484294bc623..4b0c5c5e685a 100644 --- a/arch/xtensa/kernel/head.S +++ b/arch/xtensa/kernel/head.S @@ -381,6 +381,3 @@ ENTRY(swapper_pg_dir) .fill PAGE_SIZE, 1, 0 END(swapper_pg_dir) #endif -ENTRY(empty_zero_page) - .fill PAGE_SIZE, 1, 0 -END(empty_zero_page) diff --git a/arch/xtensa/kernel/xtensa_ksyms.c b/arch/xtensa/kernel/xtensa_ksyms.c index 62d81e76e18e..ced335b4df5f 100644 --- a/arch/xtensa/kernel/xtensa_ksyms.c +++ b/arch/xtensa/kernel/xtensa_ksyms.c @@ -15,8 +15,6 @@ #include #include -EXPORT_SYMBOL(empty_zero_page); - unsigned int __sync_fetch_and_and_4(volatile void *p, unsigned int v) { BUG(); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index c3a56f6b1ea5..2a05c3885f85 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1925,6 +1925,9 @@ static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot) * for different ranges in the virtual address space. 
* * zero_page_pfn identifies the first (or the only) pfn for these pages. + * + * For architectures that don't __HAVE_COLOR_ZERO_PAGE the zero page lives in + * empty_zero_page in BSS. */ #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) @@ -1951,6 +1954,13 @@ static inline unsigned long zero_pfn(unsigned long addr) return zero_page_pfn; } + +extern uint8_t empty_zero_page[PAGE_SIZE]; + +#ifndef ZERO_PAGE +#define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page)) +#endif + #endif /* __HAVE_COLOR_ZERO_PAGE */ #ifdef CONFIG_MMU diff --git a/mm/mm_init.c b/mm/mm_init.c index ab6578516dd6..a0472d496c91 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -56,6 +56,11 @@ EXPORT_SYMBOL(high_memory); unsigned long zero_page_pfn __ro_after_init; EXPORT_SYMBOL(zero_page_pfn); +#ifndef __HAVE_COLOR_ZERO_PAGE +uint8_t empty_zero_page[PAGE_SIZE] __page_aligned_bss; +EXPORT_SYMBOL(empty_zero_page); +#endif + #ifdef CONFIG_DEBUG_MEMORY_INIT int __meminitdata mminit_loglevel; -- cgit v1.2.3 From 26513781d1b3a1e8b4b576ed62751d604a69b374 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 11 Feb 2026 12:31:41 +0200 Subject: mm: cache struct page for empty_zero_page and return it from ZERO_PAGE() For most architectures every invocation of ZERO_PAGE() does virt_to_page(empty_zero_page). But empty_zero_page is in BSS and it is enough to get its struct page once at initialization time and then use it whenever a zero page should be accessed. Add yet another __zero_page variable that will be initialized as virt_to_page(empty_zero_page) for most architectures in a weak arch_setup_zero_pages() function. For architectures that use colored zero pages (MIPS and s390) rename their setup_zero_pages() to arch_setup_zero_pages() and make it global rather than static. For architectures that cannot use virt_to_page() for BSS (arm64 and sparc64) add override of arch_setup_zero_pages(). 
Link: https://lkml.kernel.org/r/20260211103141.3215197-5-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Catalin Marinas Acked-by: David Hildenbrand (Arm) Acked-by: Liam R. Howlett Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Christophe Leroy (CS GROUP) Cc: Dave Hansen Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 6 ------ arch/arm64/mm/init.c | 5 +++++ arch/mips/mm/init.c | 11 +---------- arch/s390/mm/init.c | 4 +--- arch/sparc/include/asm/pgtable_64.h | 3 --- arch/sparc/mm/init_64.c | 17 +++++++---------- include/linux/pgtable.h | 11 ++++++++--- mm/mm_init.c | 21 +++++++++++++++++---- 8 files changed, 39 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 769570e43c18..aa4b13da6371 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -106,12 +106,6 @@ static inline void arch_leave_lazy_mmu_mode(void) #define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) \ local_flush_tlb_page_nonotify(vma, address) -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. 
- */ -#define ZERO_PAGE(vaddr) phys_to_page(__pa_symbol(empty_zero_page)) - #define pte_ERROR(e) \ pr_err("%s:%d: bad pte %016llx.\n", __FILE__, __LINE__, pte_val(e)) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 96711b8578fd..417ec7efe569 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -328,6 +328,11 @@ void __init bootmem_init(void) memblock_dump_all(); } +void __init arch_setup_zero_pages(void) +{ + __zero_page = phys_to_page(__pa_symbol(empty_zero_page)); +} + void __init arch_mm_preinit(void) { unsigned int flags = SWIOTLB_VERBOSE; diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 4f6449ad02ca..55b25e85122a 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -56,10 +56,7 @@ unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL_GPL(empty_zero_page); EXPORT_SYMBOL(zero_page_mask); -/* - * Not static inline because used by IP27 special magic initialization code - */ -static void __init setup_zero_pages(void) +void __init arch_setup_zero_pages(void) { unsigned int order; @@ -450,7 +447,6 @@ void __init arch_mm_preinit(void) BUILD_BUG_ON(IS_ENABLED(CONFIG_32BIT) && (PFN_PTE_SHIFT > PAGE_SHIFT)); maar_init(); - setup_zero_pages(); /* Setup zeroed pages. 
*/ highmem_init(); #ifdef CONFIG_64BIT @@ -461,11 +457,6 @@ void __init arch_mm_preinit(void) 0x80000000 - 4, KCORE_TEXT); #endif } -#else /* CONFIG_NUMA */ -void __init arch_mm_preinit(void) -{ - setup_zero_pages(); /* This comes from node 0 */ -} #endif /* !CONFIG_NUMA */ void free_init_pages(const char *what, unsigned long begin, unsigned long end) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 3c20475cbee2..1f72efc2a579 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -69,7 +69,7 @@ unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(zero_page_mask); -static void __init setup_zero_pages(void) +void __init arch_setup_zero_pages(void) { unsigned long total_pages = memblock_estimated_nr_free_pages(); unsigned int order; @@ -159,8 +159,6 @@ void __init arch_mm_preinit(void) cpumask_set_cpu(0, mm_cpumask(&init_mm)); pv_init(); - - setup_zero_pages(); /* Setup zeroed pages. */ } unsigned long memory_block_size_bytes(void) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 615f460c50af..74ede706fb32 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -210,9 +210,6 @@ extern unsigned long _PAGE_CACHE; extern unsigned long pg_iobits; extern unsigned long _PAGE_ALL_SZ_BITS; -extern struct page *mem_map_zero; -#define ZERO_PAGE(vaddr) (mem_map_zero) - /* PFNs are real physical page numbers. However, mem_map only begins to record * per-page information starting at pfn_base. 
This is to handle systems where * the first physical page in the machine is at some huge physical address, diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 748790998ff5..3aa47f2b6c6e 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -177,9 +177,6 @@ extern unsigned long sparc_ramdisk_image64; extern unsigned int sparc_ramdisk_image; extern unsigned int sparc_ramdisk_size; -struct page *mem_map_zero __read_mostly; -EXPORT_SYMBOL(mem_map_zero); - unsigned int sparc64_highest_unlocked_tlb_ent __read_mostly; unsigned long sparc64_kern_pri_context __read_mostly; @@ -2490,11 +2487,17 @@ static void __init register_page_bootmem_info(void) register_page_bootmem_info_node(NODE_DATA(i)); #endif } -void __init mem_init(void) + +void __init arch_setup_zero_pages(void) { phys_addr_t zero_page_pa = kern_base + ((unsigned long)&empty_zero_page[0] - KERNBASE); + __zero_page = phys_to_page(zero_page_pa); +} + +void __init mem_init(void) +{ /* * Must be done after boot memory is put on freelist, because here we * might set fields in deferred struct pages that have not yet been @@ -2503,12 +2506,6 @@ void __init mem_init(void) */ register_page_bootmem_info(); - /* - * Set up the zero page, mark it reserved, so that page count - * is not manipulated when freeing the page from user ptes. - */ - mem_map_zero = pfn_to_page(PHYS_PFN(zero_page_pa)); - if (tlb_type == cheetah || tlb_type == cheetah_plus) cheetah_ecache_flush_init(); } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2a05c3885f85..776993d4567b 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1929,6 +1929,8 @@ static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot) * For architectures that don't __HAVE_COLOR_ZERO_PAGE the zero page lives in * empty_zero_page in BSS. 
*/ +void arch_setup_zero_pages(void); + #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { @@ -1956,10 +1958,13 @@ static inline unsigned long zero_pfn(unsigned long addr) } extern uint8_t empty_zero_page[PAGE_SIZE]; +extern struct page *__zero_page; -#ifndef ZERO_PAGE -#define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page)) -#endif +static inline struct page *_zero_page(unsigned long addr) +{ + return __zero_page; +} +#define ZERO_PAGE(vaddr) _zero_page(vaddr) #endif /* __HAVE_COLOR_ZERO_PAGE */ diff --git a/mm/mm_init.c b/mm/mm_init.c index a0472d496c91..f903747ca854 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -59,7 +59,10 @@ EXPORT_SYMBOL(zero_page_pfn); #ifndef __HAVE_COLOR_ZERO_PAGE uint8_t empty_zero_page[PAGE_SIZE] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); -#endif + +struct page *__zero_page __ro_after_init; +EXPORT_SYMBOL(__zero_page); +#endif /* __HAVE_COLOR_ZERO_PAGE */ #ifdef CONFIG_DEBUG_MEMORY_INIT int __meminitdata mminit_loglevel; @@ -2680,12 +2683,21 @@ static void __init mem_init_print_info(void) ); } -static int __init init_zero_page_pfn(void) +#ifndef __HAVE_COLOR_ZERO_PAGE +/* + * architectures that __HAVE_COLOR_ZERO_PAGE must define this function + */ +void __init __weak arch_setup_zero_pages(void) +{ + __zero_page = virt_to_page(empty_zero_page); +} +#endif + +static void __init init_zero_page_pfn(void) { + arch_setup_zero_pages(); zero_page_pfn = page_to_pfn(ZERO_PAGE(0)); - return 0; } -early_initcall(init_zero_page_pfn); void __init __weak arch_mm_preinit(void) { @@ -2709,6 +2721,7 @@ void __init mm_core_init_early(void) void __init mm_core_init(void) { arch_mm_preinit(); + init_zero_page_pfn(); /* Initializations relying on SMP setup */ BUILD_BUG_ON(MAX_ZONELISTS > 2); -- cgit v1.2.3 From 36cec70e4acbae21e39527c1d41083bca148c7c8 Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Sat, 21 Feb 2026 17:39:15 +0800 Subject: mm: khugepaged: add trace_mm_khugepaged_scan event Patch series 
"Improve khugepaged scan logic", v8. This series improves the khugepaged scan logic and reduces CPU consumption by prioritizing scanning tasks that access memory frequently. The following data is traced by bpftrace[1] on a desktop system. After the system has been left idle for 10 minutes upon booting, a lot of SCAN_PMD_MAPPED or SCAN_NO_PTE_TABLE are observed during a full scan by khugepaged. @scan_pmd_status[1]: 1 ## SCAN_SUCCEED @scan_pmd_status[6]: 2 ## SCAN_EXCEED_SHARED_PTE @scan_pmd_status[3]: 142 ## SCAN_PMD_MAPPED @scan_pmd_status[2]: 178 ## SCAN_NO_PTE_TABLE total progress size: 674 MB Total time : 419 seconds ## include khugepaged_scan_sleep_millisecs The khugepaged has below phenomenon: the khugepaged list is scanned in a FIFO manner, as long as the task is not destroyed, 1. the task no longer has memory that can be collapsed into hugepage, continues scan it always. 2. the task at the front of the khugepaged scan list is cold, they are still scanned first. 3. everyone scan at intervals of khugepaged_scan_sleep_millisecs (default 10s). If we always scan the above two cases first, the valid scan will have to wait for a long time. For the first case, when the memory is either SCAN_PMD_MAPPED or SCAN_NO_PTE_TABLE or SCAN_PTE_MAPPED_HUGEPAGE [5], just skip it. For the second case, if the user has explicitly informed us via MADV_FREE that these folios will be freed, just skip it only. The below is some performance test results. 
kernbench results (testing on x86_64 machine): baseline w/o patches test w/ patches Amean user-32 18522.51 ( 0.00%) 18333.64 * 1.02%* Amean syst-32 1137.96 ( 0.00%) 1113.79 * 2.12%* Amean elsp-32 666.04 ( 0.00%) 659.44 * 0.99%* BAmean-95 user-32 18520.01 ( 0.00%) 18323.57 ( 1.06%) BAmean-95 syst-32 1137.68 ( 0.00%) 1110.50 ( 2.39%) BAmean-95 elsp-32 665.92 ( 0.00%) 659.06 ( 1.03%) BAmean-99 user-32 18520.01 ( 0.00%) 18323.57 ( 1.06%) BAmean-99 syst-32 1137.68 ( 0.00%) 1110.50 ( 2.39%) BAmean-99 elsp-32 665.92 ( 0.00%) 659.06 ( 1.03%) Create three tasks[2]: hot1 -> cold -> hot2. After all three tasks are created, each allocates 128 MB of memory. The hot1/hot2 tasks continuously access their 128 MB of memory, while the cold task only accesses its memory briefly and then calls madvise(MADV_FREE). Here are the performance test results: (Throughput bigger is better, other smaller is better) Testing on x86_64 machine: | task hot2 | without patch | with patch | delta | |---------------------|---------------|---------------|---------| | total accesses time | 3.14 sec | 2.93 sec | -6.69% | | cycles per access | 4.96 | 2.21 | -55.44% | | Throughput | 104.38 M/sec | 111.89 M/sec | +7.19% | | dTLB-load-misses | 284814532 | 69597236 | -75.56% | Testing on qemu-system-x86_64 -enable-kvm: | task hot2 | without patch | with patch | delta | |---------------------|---------------|---------------|---------| | total accesses time | 3.35 sec | 2.96 sec | -11.64% | | cycles per access | 7.29 | 2.07 | -71.60% | | Throughput | 97.67 M/sec | 110.77 M/sec | +13.41% | | dTLB-load-misses | 241600871 | 3216108 | -98.67% | This patch (of 4): Add mm_khugepaged_scan event to track the total time for a full scan and the total number of pages scanned by khugepaged.
Link: https://lkml.kernel.org/r/20260221093918.1456187-2-vernon2gm@gmail.com Signed-off-by: Vernon Yang Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Barry Song Reviewed-by: Lance Yang Reviewed-by: Dev Jain Cc: Baolin Wang Cc: Dev Jain Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Zi Yan Signed-off-by: Andrew Morton --- include/trace/events/huge_memory.h | 25 +++++++++++++++++++++++++ mm/khugepaged.c | 2 ++ 2 files changed, 27 insertions(+) (limited to 'include') diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 4e41bff31888..384e29f6bef0 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -237,5 +237,30 @@ TRACE_EVENT(mm_khugepaged_collapse_file, __print_symbolic(__entry->result, SCAN_STATUS)) ); +TRACE_EVENT(mm_khugepaged_scan, + + TP_PROTO(struct mm_struct *mm, unsigned int progress, + bool full_scan_finished), + + TP_ARGS(mm, progress, full_scan_finished), + + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __field(unsigned int, progress) + __field(bool, full_scan_finished) + ), + + TP_fast_assign( + __entry->mm = mm; + __entry->progress = progress; + __entry->full_scan_finished = full_scan_finished; + ), + + TP_printk("mm=%p, progress=%u, full_scan_finished=%d", + __entry->mm, + __entry->progress, + __entry->full_scan_finished) +); + #endif /* __HUGE_MEMORY_H */ #include diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 17ab58681032..4d7baf220ad9 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2527,6 +2527,8 @@ breakouterloop_mmap_lock: collect_mm_slot(slot); } + trace_mm_khugepaged_scan(mm, progress, khugepaged_scan.mm_slot == NULL); + return progress; } -- cgit v1.2.3 From 6cc153f90b7cf07db2b49469dfd79141b145036a Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Sat, 21 Feb 2026 17:39:17 +0800 Subject: mm: add folio_test_lazyfree helper Add folio_test_lazyfree() function 
to identify lazy-free folios to improve code readability. Link: https://lkml.kernel.org/r/20260221093918.1456187-4-vernon2gm@gmail.com Signed-off-by: Vernon Yang Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Lance Yang Reviewed-by: Dev Jain Reviewed-by: Barry Song Cc: Baolin Wang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 5 +++++ mm/rmap.c | 2 +- mm/vmscan.c | 5 ++--- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f7a0e4af0c73..415e9f2ef616 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -724,6 +724,11 @@ static __always_inline bool folio_test_anon(const struct folio *folio) return ((unsigned long)folio->mapping & FOLIO_MAPPING_ANON) != 0; } +static __always_inline bool folio_test_lazyfree(const struct folio *folio) +{ + return folio_test_anon(folio) && !folio_test_swapbacked(folio); +} + static __always_inline bool PageAnonNotKsm(const struct page *page) { unsigned long flags = (unsigned long)page_folio(page)->mapping; diff --git a/mm/rmap.c b/mm/rmap.c index 8f08090d7eb9..5fd22ba59d35 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2060,7 +2060,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, } if (!pvmw.pte) { - if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { + if (folio_test_lazyfree(folio)) { if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio)) goto walk_done; /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 031c5c035a82..d531040a3593 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -963,8 +963,7 @@ static void folio_check_dirty_writeback(struct folio *folio, * They could be mistakenly treated as file lru. So further anon * test is needed. 
*/ - if (!folio_is_file_lru(folio) || - (folio_test_anon(folio) && !folio_test_swapbacked(folio))) { + if (!folio_is_file_lru(folio) || folio_test_lazyfree(folio)) { *dirty = false; *writeback = false; return; @@ -1508,7 +1507,7 @@ retry: } } - if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { + if (folio_test_lazyfree(folio)) { /* follow __remove_mapping for reference */ if (!folio_ref_freeze(folio, 1)) goto keep_locked; -- cgit v1.2.3 From 05620419776ab07f1d057bdca5be846f263df1fd Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Sat, 21 Feb 2026 17:39:18 +0800 Subject: mm: khugepaged: skip lazy-free folios For example, create three tasks: hot1 -> cold -> hot2. After all three tasks are created, each allocates 128 MB of memory. The hot1/hot2 tasks continuously access their 128 MB of memory, while the cold task only accesses its memory briefly and then calls madvise(MADV_FREE). However, khugepaged still prioritizes scanning the cold task and only scans the hot2 task after completing the scan of the cold task. All folios in VM_DROPPABLE are lazyfree. Collapsing maintains that property, so we can just collapse and memory pressure in the future will free it up. In contrast, collapsing in !VM_DROPPABLE does not maintain that property: the collapsed folio will not be lazyfree and memory pressure in the future will not be able to free it up. So if the user has explicitly informed us via MADV_FREE that this memory will be freed, and this vma does not have the VM_DROPPABLE flag, it is appropriate for khugepaged to simply skip it, thereby avoiding unnecessary scan and collapse operations and reducing CPU waste.
Here are the performance test results: (Throughput bigger is better, other smaller is better) Testing on x86_64 machine: | task hot2 | without patch | with patch | delta | |---------------------|---------------|---------------|---------| | total accesses time | 3.14 sec | 2.93 sec | -6.69% | | cycles per access | 4.96 | 2.21 | -55.44% | | Throughput | 104.38 M/sec | 111.89 M/sec | +7.19% | | dTLB-load-misses | 284814532 | 69597236 | -75.56% | Testing on qemu-system-x86_64 -enable-kvm: | task hot2 | without patch | with patch | delta | |---------------------|---------------|---------------|---------| | total accesses time | 3.35 sec | 2.96 sec | -11.64% | | cycles per access | 7.29 | 2.07 | -71.60% | | Throughput | 97.67 M/sec | 110.77 M/sec | +13.41% | | dTLB-load-misses | 241600871 | 3216108 | -98.67% | [vernon2gm@gmail.com: add comment about VM_DROPPABLE in code, make it clearer] Link: https://lkml.kernel.org/r/i4uowkt4h2ev47obm5h2vtd4zbk6fyw5g364up7kkjn2vmcikq@auepvqethj5r Link: https://lkml.kernel.org/r/20260221093918.1456187-5-vernon2gm@gmail.com Signed-off-by: Vernon Yang Acked-by: David Hildenbrand (arm) Reviewed-by: Lance Yang Reviewed-by: Barry Song Cc: Baolin Wang Cc: Dev Jain Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Zi Yan Signed-off-by: Andrew Morton --- include/trace/events/huge_memory.h | 1 + mm/khugepaged.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'include') diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 384e29f6bef0..bcdc57eea270 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -25,6 +25,7 @@ EM( SCAN_PAGE_LRU, "page_not_in_lru") \ EM( SCAN_PAGE_LOCK, "page_locked") \ EM( SCAN_PAGE_ANON, "page_not_anon") \ + EM( SCAN_PAGE_LAZYFREE, "page_lazyfree") \ EM( SCAN_PAGE_COMPOUND, "page_compound") \ EM( SCAN_ANY_PROCESS, "no_process_for_page") \ 
EM( SCAN_VMA_NULL, "vma_null") \ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 8ee3c44bc851..13b0fe50dfc5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -46,6 +46,7 @@ enum scan_result { SCAN_PAGE_LRU, SCAN_PAGE_LOCK, SCAN_PAGE_ANON, + SCAN_PAGE_LAZYFREE, SCAN_PAGE_COMPOUND, SCAN_ANY_PROCESS, SCAN_VMA_NULL, @@ -577,6 +578,16 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma, folio = page_folio(page); VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio); + /* + * If the vma has the VM_DROPPABLE flag, the collapse will + * preserve the lazyfree property without needing to skip. + */ + if (cc->is_khugepaged && !(vma->vm_flags & VM_DROPPABLE) && + folio_test_lazyfree(folio) && !pte_dirty(pteval)) { + result = SCAN_PAGE_LAZYFREE; + goto out; + } + /* See hpage_collapse_scan_pmd(). */ if (folio_maybe_mapped_shared(folio)) { ++shared; @@ -1325,6 +1336,16 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm, } folio = page_folio(page); + /* + * If the vma has the VM_DROPPABLE flag, the collapse will + * preserve the lazyfree property without needing to skip. + */ + if (cc->is_khugepaged && !(vma->vm_flags & VM_DROPPABLE) && + folio_test_lazyfree(folio) && !pte_dirty(pteval)) { + result = SCAN_PAGE_LAZYFREE; + goto out_unmap; + } + if (!folio_test_anon(folio)) { result = SCAN_PAGE_ANON; goto out_unmap; -- cgit v1.2.3 From 3f2ad90060f65d6f66414b8a67c569154bafec7b Mon Sep 17 00:00:00 2001 From: Jason Miu Date: Thu, 5 Feb 2026 18:14:27 -0800 Subject: kho: adopt radix tree for preserved memory tracking Patch series "Make KHO Stateless", v9. This series transitions KHO from an xarray-based metadata tracking system with serialization to a radix tree data structure that can be passed directly to the next kernel. The key motivations for this change are to: - Eliminate the need for data serialization before kexec. - Remove the KHO finalize state. - Pass preservation metadata more directly to the next kernel via the FDT. 
The new approach uses a radix tree to mark preserved pages. A page's physical address and its order are encoded into a single value. The tree is composed of multiple levels of page-sized tables, with leaf nodes being bitmaps where each set bit represents a preserved page. The physical address of the radix tree's root is passed in the FDT, allowing the next kernel to reconstruct the preserved memory map. This series is broken down into the following patches: 1. kho: Adopt radix tree for preserved memory tracking: Replaces the xarray-based tracker with the new radix tree implementation and increments the ABI version. 2. kho: Remove finalize state and clients: Removes the now-obsolete kho_finalize() function and its usage from client code and debugfs. This patch (of 2): Introduce a radix tree implementation for tracking preserved memory pages and switch the KHO memory tracking mechanism to use it. This lays the groundwork for a stateless KHO implementation that eliminates the need for serialization and the associated "finalize" state. This patch introduces the core radix tree data structures and constants to the KHO ABI. It adds the radix tree node and leaf structures, along with documentation for the radix tree key encoding scheme that combines a page's physical address and order. To support broader use by other kernel subsystems, such as hugetlb preservation, the core radix tree manipulation functions are exported as a public API. The xarray-based memory tracking is replaced with this new radix tree implementation. The core KHO preservation and unpreservation functions are wired up to use the radix tree helpers. On boot, the second kernel restores the preserved memory map by walking the radix tree whose root physical address is passed via the FDT. The ABI `compatible` version is bumped to "kho-v2" to reflect the structural changes in the preserved memory map and sub-FDT property names. 
This includes renaming "fdt" to "preserved-data" to better reflect that preserved state may use formats other than FDT. [ran.xiaokai@zte.com.cn: fix child node parsing for debugfs in/sub_fdts] Link: https://lkml.kernel.org/r/20260309033530.244508-1-ranxiaokai627@163.com Link: https://lkml.kernel.org/r/20260206021428.3386442-1-jasonmiu@google.com Link: https://lkml.kernel.org/r/20260206021428.3386442-2-jasonmiu@google.com Signed-off-by: Jason Miu Signed-off-by: Ran Xiaokai Reviewed-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Baoquan He Cc: Changyuan Lyu Cc: David Matlack Cc: David Rientjes Cc: Jason Gunthorpe Cc: Pratyush Yadav Cc: Ran Xiaokai Signed-off-by: Andrew Morton --- Documentation/core-api/kho/abi.rst | 6 + Documentation/core-api/kho/index.rst | 12 + include/linux/kho/abi/kexec_handover.h | 144 ++++++- include/linux/kho_radix_tree.h | 70 ++++ kernel/liveupdate/kexec_handover.c | 619 +++++++++++++++-------------- kernel/liveupdate/kexec_handover_debugfs.c | 3 +- 6 files changed, 547 insertions(+), 307 deletions(-) create mode 100644 include/linux/kho_radix_tree.h (limited to 'include') diff --git a/Documentation/core-api/kho/abi.rst b/Documentation/core-api/kho/abi.rst index 2e63be3486cf..799d743105a6 100644 --- a/Documentation/core-api/kho/abi.rst +++ b/Documentation/core-api/kho/abi.rst @@ -22,6 +22,12 @@ memblock preservation ABI .. kernel-doc:: include/linux/kho/abi/memblock.h :doc: memblock kexec handover ABI +KHO persistent memory tracker ABI +================================= + +.. kernel-doc:: include/linux/kho/abi/kexec_handover.h + :doc: KHO persistent memory tracker + See Also ======== diff --git a/Documentation/core-api/kho/index.rst b/Documentation/core-api/kho/index.rst index dcc6a36cc134..002bdf0beb2e 100644 --- a/Documentation/core-api/kho/index.rst +++ b/Documentation/core-api/kho/index.rst @@ -83,6 +83,18 @@ called serialization. 
When the FDT is generated, some properties of the system may become immutable because they are already written down in the FDT. That state is called the KHO finalization phase. +Kexec Handover Radix Tree +========================= + +.. kernel-doc:: include/linux/kho_radix_tree.h + :doc: Kexec Handover Radix Tree + +Public API +========== + +.. kernel-doc:: kernel/liveupdate/kexec_handover.c + :export: + See Also ======== diff --git a/include/linux/kho/abi/kexec_handover.h b/include/linux/kho/abi/kexec_handover.h index 2201a0d2c159..6b7d8ef550f9 100644 --- a/include/linux/kho/abi/kexec_handover.h +++ b/include/linux/kho/abi/kexec_handover.h @@ -10,8 +10,13 @@ #ifndef _LINUX_KHO_ABI_KEXEC_HANDOVER_H #define _LINUX_KHO_ABI_KEXEC_HANDOVER_H +#include +#include +#include #include +#include + /** * DOC: Kexec Handover ABI * @@ -29,32 +34,32 @@ * compatibility is only guaranteed for kernels supporting the same ABI version. * * FDT Structure Overview: - * The FDT serves as a central registry for physical - * addresses of preserved data structures and sub-FDTs. The first kernel - * populates this FDT with references to memory regions and other FDTs that - * need to persist across the kexec transition. The subsequent kernel then - * parses this FDT to locate and restore the preserved data.:: + * The FDT serves as a central registry for physical addresses of preserved + * data structures. The first kernel populates this FDT with references to + * memory regions and other metadata that need to persist across the kexec + * transition. The subsequent kernel then parses this FDT to locate and + * restore the preserved data.:: * * / { - * compatible = "kho-v1"; + * compatible = "kho-v2"; * * preserved-memory-map = <0x...>; * * { - * fdt = <0x...>; + * preserved-data = <0x...>; * }; * * { - * fdt = <0x...>; + * preserved-data = <0x...>; * }; * ... ... 
* { - * fdt = <0x...>; + * preserved-data = <0x...>; * }; * }; * * Root KHO Node (/): - * - compatible: "kho-v1" + * - compatible: "kho-v2" * * Identifies the overall KHO ABI version. * @@ -69,20 +74,20 @@ * is provided by the subsystem that uses KHO for preserving its * data. * - * - fdt: u64 + * - preserved-data: u64 * - * Physical address pointing to a subnode FDT blob that is also + * Physical address pointing to a subnode data blob that is also * being preserved. */ /* The compatible string for the KHO FDT root node. */ -#define KHO_FDT_COMPATIBLE "kho-v1" +#define KHO_FDT_COMPATIBLE "kho-v2" /* The FDT property for the preserved memory map. */ #define KHO_FDT_MEMORY_MAP_PROP_NAME "preserved-memory-map" -/* The FDT property for sub-FDTs. */ -#define KHO_FDT_SUB_TREE_PROP_NAME "fdt" +/* The FDT property for preserved data blobs. */ +#define KHO_FDT_SUB_TREE_PROP_NAME "preserved-data" /** * DOC: Kexec Handover ABI for vmalloc Preservation @@ -160,4 +165,113 @@ struct kho_vmalloc { unsigned short order; }; +/** + * DOC: KHO persistent memory tracker + * + * KHO tracks preserved memory using a radix tree data structure. Each node of + * the tree is exactly a single page. The leaf nodes are bitmaps where each set + * bit is a preserved page of any order. The intermediate nodes are tables of + * physical addresses that point to a lower level node. + * + * The tree hierarchy is shown below:: + * + * root + * +-------------------+ + * | Level 5 | (struct kho_radix_node) + * +-------------------+ + * | + * v + * +-------------------+ + * | Level 4 | (struct kho_radix_node) + * +-------------------+ + * | + * | ... (intermediate levels) + * | + * v + * +-------------------+ + * | Level 0 | (struct kho_radix_leaf) + * +-------------------+ + * + * The tree is traversed using a key that encodes the page's physical address + * (pa) and its order into a single unsigned long value.
The encoded key value + * is composed of two parts: the 'order bit' in the upper part and the + * 'shifted physical address' in the lower part.:: + * + * +------------+-----------------------------+--------------------------+ + * | Page Order | Order Bit | Shifted Physical Address | + * +------------+-----------------------------+--------------------------+ + * | 0 | ...000100 ... (at bit 52) | pa >> (PAGE_SHIFT + 0) | + * | 1 | ...000010 ... (at bit 51) | pa >> (PAGE_SHIFT + 1) | + * | 2 | ...000001 ... (at bit 50) | pa >> (PAGE_SHIFT + 2) | + * | ... | ... | ... | + * +------------+-----------------------------+--------------------------+ + * + * Shifted Physical Address: + * The 'shifted physical address' is the physical address normalized for its + * order. It effectively represents the PFN shifted right by the order. + * + * Order Bit: + * The 'order bit' encodes the page order by setting a single bit at a + * specific position. The position of this bit itself represents the order. + * + * For instance, on a 64-bit system with 4KB pages (PAGE_SHIFT = 12), the + * maximum range for the shifted physical address (for order 0) is 52 bits + * (64 - 12). This address occupies bits [0-51]. For order 0, the order bit is + * set at position 52. + * + * The following diagram illustrates how the encoded key value is split into + * indices for the tree levels, with PAGE_SIZE of 4KB:: + * + * 63:60 59:51 50:42 41:33 32:24 23:15 14:0 + * +---------+--------+--------+--------+--------+--------+-----------------+ + * | 0 | Lv 5 | Lv 4 | Lv 3 | Lv 2 | Lv 1 | Lv 0 (bitmap) | + * +---------+--------+--------+--------+--------+--------+-----------------+ + * + * The radix tree stores pages of all orders in a single 6-level hierarchy. It + * efficiently shares higher tree levels, especially due to common zero top + * address bits, allowing a single, efficient algorithm to manage all + * pages. 
This bitmap approach also offers memory efficiency; for example, a + * 512KB bitmap can cover a 16GB memory range for 0-order pages with PAGE_SIZE = + * 4KB. + * + * The data structures defined here are part of the KHO ABI. Any modification + * to these structures that breaks backward compatibility must be accompanied by + * an update to the "compatible" string. This ensures that a newer kernel can + * correctly interpret the data passed by an older kernel. + */ + +/* + * Defines constants for the KHO radix tree structure, used to track preserved + * memory. These constants govern the indexing, sizing, and depth of the tree. + */ +enum kho_radix_consts { + /* + * The bit position of the order bit (and also the length of the + * shifted physical address) for an order-0 page. + */ + KHO_ORDER_0_LOG2 = 64 - PAGE_SHIFT, + + /* Size of the table in kho_radix_node, in log2 */ + KHO_TABLE_SIZE_LOG2 = const_ilog2(PAGE_SIZE / sizeof(phys_addr_t)), + + /* Number of bits in the kho_radix_leaf bitmap, in log2 */ + KHO_BITMAP_SIZE_LOG2 = PAGE_SHIFT + const_ilog2(BITS_PER_BYTE), + + /* + * The total tree depth is the number of intermediate levels + * and 1 bitmap level. + */ + KHO_TREE_MAX_DEPTH = + DIV_ROUND_UP(KHO_ORDER_0_LOG2 - KHO_BITMAP_SIZE_LOG2, + KHO_TABLE_SIZE_LOG2) + 1, +}; + +struct kho_radix_node { + u64 table[1 << KHO_TABLE_SIZE_LOG2]; +}; + +struct kho_radix_leaf { + DECLARE_BITMAP(bitmap, 1 << KHO_BITMAP_SIZE_LOG2); +}; + #endif /* _LINUX_KHO_ABI_KEXEC_HANDOVER_H */ diff --git a/include/linux/kho_radix_tree.h b/include/linux/kho_radix_tree.h new file mode 100644 index 000000000000..84e918b96e53 --- /dev/null +++ b/include/linux/kho_radix_tree.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_KHO_RADIX_TREE_H +#define _LINUX_KHO_RADIX_TREE_H + +#include +#include +#include +#include + +/** + * DOC: Kexec Handover Radix Tree + * + * This is a radix tree implementation for tracking physical memory pages + * across kexec transitions. 
It was developed for the KHO mechanism but is + * designed for broader use by any subsystem that needs to preserve pages. + * + * The radix tree is a multi-level tree where leaf nodes are bitmaps + * representing individual pages. To allow pages of different sizes (orders) + * to be stored efficiently in a single tree, it uses a unique key encoding + * scheme. Each key is an unsigned long that combines a page's physical + * address and its order. + * + * Client code is responsible for allocating the root node of the tree, + * initializing the mutex lock, and managing its lifecycle. It must use the + * tree data structures defined in the KHO ABI, + * `include/linux/kho/abi/kexec_handover.h`. + */ + +struct kho_radix_node; + +struct kho_radix_tree { + struct kho_radix_node *root; + struct mutex lock; /* protects the tree's structure and root pointer */ +}; + +typedef int (*kho_radix_tree_walk_callback_t)(phys_addr_t phys, + unsigned int order); + +#ifdef CONFIG_KEXEC_HANDOVER + +int kho_radix_add_page(struct kho_radix_tree *tree, unsigned long pfn, + unsigned int order); + +void kho_radix_del_page(struct kho_radix_tree *tree, unsigned long pfn, + unsigned int order); + +int kho_radix_walk_tree(struct kho_radix_tree *tree, + kho_radix_tree_walk_callback_t cb); + +#else /* #ifdef CONFIG_KEXEC_HANDOVER */ + +static inline int kho_radix_add_page(struct kho_radix_tree *tree, long pfn, + unsigned int order) +{ + return -EOPNOTSUPP; +} + +static inline void kho_radix_del_page(struct kho_radix_tree *tree, + unsigned long pfn, unsigned int order) { } + +static inline int kho_radix_walk_tree(struct kho_radix_tree *tree, + kho_radix_tree_walk_callback_t cb) +{ + return -EOPNOTSUPP; +} + +#endif /* #ifdef CONFIG_KEXEC_HANDOVER */ + +#endif /* _LINUX_KHO_RADIX_TREE_H */ diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 4356f277b462..ad877926f3f6 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ 
-5,6 +5,7 @@ * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport * Copyright (C) 2025 Google LLC, Changyuan Lyu * Copyright (C) 2025 Pasha Tatashin + * Copyright (C) 2026 Google LLC, Jason Miu */ #define pr_fmt(fmt) "KHO: " fmt @@ -15,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -64,156 +66,308 @@ static int __init kho_parse_enable(char *p) } early_param("kho", kho_parse_enable); -/* - * Keep track of memory that is to be preserved across KHO. - * - * The serializing side uses two levels of xarrays to manage chunks of per-order - * PAGE_SIZE byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order - * of a 8TB system would fit inside a single 4096 byte bitmap. For order 0 - * allocations each bitmap will cover 128M of address space. Thus, for 16G of - * memory at most 512K of bitmap memory will be needed for order 0. - * - * This approach is fully incremental, as the serialization progresses folios - * can continue be aggregated to the tracker. The final step, immediately prior - * to kexec would serialize the xarray information into a linked list for the - * successor kernel to parse. - */ - -#define PRESERVE_BITS (PAGE_SIZE * 8) - -struct kho_mem_phys_bits { - DECLARE_BITMAP(preserve, PRESERVE_BITS); -}; - -static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE); - -struct kho_mem_phys { - /* - * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized - * to order. 
- */ - struct xarray phys_bits; -}; - -struct kho_mem_track { - /* Points to kho_mem_phys, each order gets its own bitmap tree */ - struct xarray orders; -}; - -struct khoser_mem_chunk; - struct kho_out { void *fdt; bool finalized; struct mutex lock; /* protects KHO FDT finalization */ - struct kho_mem_track track; + struct kho_radix_tree radix_tree; struct kho_debugfs dbg; }; static struct kho_out kho_out = { .lock = __MUTEX_INITIALIZER(kho_out.lock), - .track = { - .orders = XARRAY_INIT(kho_out.track.orders, 0), + .radix_tree = { + .lock = __MUTEX_INITIALIZER(kho_out.radix_tree.lock), }, .finalized = false, }; -static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) +/** + * kho_radix_encode_key - Encodes a physical address and order into a radix key. + * @phys: The physical address of the page. + * @order: The order of the page. + * + * This function combines a page's physical address and its order into a + * single unsigned long, which is used as a key for all radix tree + * operations. + * + * Return: The encoded unsigned long radix key. + */ +static unsigned long kho_radix_encode_key(phys_addr_t phys, unsigned int order) { - void *res = xa_load(xa, index); + /* Order bits part */ + unsigned long h = 1UL << (KHO_ORDER_0_LOG2 - order); + /* Shifted physical address part */ + unsigned long l = phys >> (PAGE_SHIFT + order); - if (res) - return res; + return h | l; +} - void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL); +/** + * kho_radix_decode_key - Decodes a radix key back into a physical address and order. + * @key: The unsigned long key to decode. + * @order: An output parameter, a pointer to an unsigned int where the decoded + * page order will be stored. + * + * This function reverses the encoding performed by kho_radix_encode_key(), + * extracting the original physical address and page order from a given key. + * + * Return: The decoded physical address. 
+ */ +static phys_addr_t kho_radix_decode_key(unsigned long key, unsigned int *order) +{ + unsigned int order_bit = fls64(key); + phys_addr_t phys; - if (!elm) - return ERR_PTR(-ENOMEM); + /* order_bit is numbered starting at 1 from fls64 */ + *order = KHO_ORDER_0_LOG2 - order_bit + 1; + /* The order is discarded by the shift */ + phys = key << (PAGE_SHIFT + *order); - if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE))) - return ERR_PTR(-EINVAL); + return phys; +} - res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); - if (xa_is_err(res)) - return ERR_PTR(xa_err(res)); - else if (res) - return res; +static unsigned long kho_radix_get_bitmap_index(unsigned long key) +{ + return key % (1 << KHO_BITMAP_SIZE_LOG2); +} + +static unsigned long kho_radix_get_table_index(unsigned long key, + unsigned int level) +{ + int s; - return no_free_ptr(elm); + s = ((level - 1) * KHO_TABLE_SIZE_LOG2) + KHO_BITMAP_SIZE_LOG2; + return (key >> s) % (1 << KHO_TABLE_SIZE_LOG2); } -static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn, - unsigned int order) +/** + * kho_radix_add_page - Marks a page as preserved in the radix tree. + * @tree: The KHO radix tree. + * @pfn: The page frame number of the page to preserve. + * @order: The order of the page. + * + * This function traverses the radix tree based on the key derived from @pfn + * and @order. It sets the corresponding bit in the leaf bitmap to mark the + * page for preservation. If intermediate nodes do not exist along the path, + * they are allocated and added to the tree. + * + * Return: 0 on success, or a negative error code on failure. 
+ */ +int kho_radix_add_page(struct kho_radix_tree *tree, + unsigned long pfn, unsigned int order) { - struct kho_mem_phys_bits *bits; - struct kho_mem_phys *physxa; - const unsigned long pfn_high = pfn >> order; + /* Newly allocated nodes for error cleanup */ + struct kho_radix_node *intermediate_nodes[KHO_TREE_MAX_DEPTH] = { 0 }; + unsigned long key = kho_radix_encode_key(PFN_PHYS(pfn), order); + struct kho_radix_node *anchor_node = NULL; + struct kho_radix_node *node = tree->root; + struct kho_radix_node *new_node; + unsigned int i, idx, anchor_idx; + struct kho_radix_leaf *leaf; + int err = 0; - physxa = xa_load(&track->orders, order); - if (WARN_ON_ONCE(!physxa)) - return; + if (WARN_ON_ONCE(!tree->root)) + return -EINVAL; - bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); - if (WARN_ON_ONCE(!bits)) - return; + might_sleep(); + + guard(mutex)(&tree->lock); + + /* Go from high levels to low levels */ + for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) { + idx = kho_radix_get_table_index(key, i); + + if (node->table[idx]) { + node = phys_to_virt(node->table[idx]); + continue; + } + + /* Next node is empty, create a new node for it */ + new_node = (struct kho_radix_node *)get_zeroed_page(GFP_KERNEL); + if (!new_node) { + err = -ENOMEM; + goto err_free_nodes; + } + + node->table[idx] = virt_to_phys(new_node); - clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); + /* + * Capture the node where the new branch starts for cleanup + * if allocation fails. 
+ */ + if (!anchor_node) { + anchor_node = node; + anchor_idx = idx; + } + intermediate_nodes[i] = new_node; + + node = new_node; + } + + /* Handle the leaf level bitmap (level 0) */ + idx = kho_radix_get_bitmap_index(key); + leaf = (struct kho_radix_leaf *)node; + __set_bit(idx, leaf->bitmap); + + return 0; + +err_free_nodes: + for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) { + if (intermediate_nodes[i]) + free_page((unsigned long)intermediate_nodes[i]); + } + if (anchor_node) + anchor_node->table[anchor_idx] = 0; + + return err; } +EXPORT_SYMBOL_GPL(kho_radix_add_page); -static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, - unsigned long end_pfn) +/** + * kho_radix_del_page - Removes a page's preservation status from the radix tree. + * @tree: The KHO radix tree. + * @pfn: The page frame number of the page to unpreserve. + * @order: The order of the page. + * + * This function traverses the radix tree and clears the bit corresponding to + * the page, effectively removing its "preserved" status. It does not free + * the tree's intermediate nodes, even if they become empty. + */ +void kho_radix_del_page(struct kho_radix_tree *tree, unsigned long pfn, + unsigned int order) { - unsigned int order; + unsigned long key = kho_radix_encode_key(PFN_PHYS(pfn), order); + struct kho_radix_node *node = tree->root; + struct kho_radix_leaf *leaf; + unsigned int i, idx; - while (pfn < end_pfn) { - order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); + if (WARN_ON_ONCE(!tree->root)) + return; + + might_sleep(); - __kho_unpreserve_order(track, pfn, order); + guard(mutex)(&tree->lock); - pfn += 1 << order; + /* Go from high levels to low levels */ + for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) { + idx = kho_radix_get_table_index(key, i); + + /* + * Attempting to delete a page that has not been preserved, + * return with a warning. 
+ */ + if (WARN_ON(!node->table[idx])) + return; + + node = phys_to_virt(node->table[idx]); } + + /* Handle the leaf level bitmap (level 0) */ + leaf = (struct kho_radix_leaf *)node; + idx = kho_radix_get_bitmap_index(key); + __clear_bit(idx, leaf->bitmap); } +EXPORT_SYMBOL_GPL(kho_radix_del_page); -static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, - unsigned int order) +static int kho_radix_walk_leaf(struct kho_radix_leaf *leaf, + unsigned long key, + kho_radix_tree_walk_callback_t cb) { - struct kho_mem_phys_bits *bits; - struct kho_mem_phys *physxa, *new_physxa; - const unsigned long pfn_high = pfn >> order; + unsigned long *bitmap = (unsigned long *)leaf; + unsigned int order; + phys_addr_t phys; + unsigned int i; + int err; - might_sleep(); - physxa = xa_load(&track->orders, order); - if (!physxa) { - int err; + for_each_set_bit(i, bitmap, PAGE_SIZE * BITS_PER_BYTE) { + phys = kho_radix_decode_key(key | i, &order); + err = cb(phys, order); + if (err) + return err; + } - new_physxa = kzalloc_obj(*physxa); - if (!new_physxa) - return -ENOMEM; + return 0; +} + +static int __kho_radix_walk_tree(struct kho_radix_node *root, + unsigned int level, unsigned long start, + kho_radix_tree_walk_callback_t cb) +{ + struct kho_radix_node *node; + struct kho_radix_leaf *leaf; + unsigned long key, i; + unsigned int shift; + int err; + + for (i = 0; i < PAGE_SIZE / sizeof(phys_addr_t); i++) { + if (!root->table[i]) + continue; - xa_init(&new_physxa->phys_bits); - physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa, - GFP_KERNEL); + shift = ((level - 1) * KHO_TABLE_SIZE_LOG2) + + KHO_BITMAP_SIZE_LOG2; + key = start | (i << shift); - err = xa_err(physxa); - if (err || physxa) { - xa_destroy(&new_physxa->phys_bits); - kfree(new_physxa); + node = phys_to_virt(root->table[i]); - if (err) - return err; + if (level == 1) { + /* + * we are at level 1, + * node is pointing to the level 0 bitmap. 
+ */ + leaf = (struct kho_radix_leaf *)node; + err = kho_radix_walk_leaf(leaf, key, cb); } else { - physxa = new_physxa; + err = __kho_radix_walk_tree(node, level - 1, + key, cb); } + + if (err) + return err; } - bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS); - if (IS_ERR(bits)) - return PTR_ERR(bits); + return 0; +} + +/** + * kho_radix_walk_tree - Traverses the radix tree and calls a callback for each preserved page. + * @tree: A pointer to the KHO radix tree to walk. + * @cb: A callback function of type kho_radix_tree_walk_callback_t that will be + * invoked for each preserved page found in the tree. The callback receives + * the physical address and order of the preserved page. + * + * This function walks the radix tree, searching from the specified top level + * down to the lowest level (level 0). For each preserved page found, it invokes + * the provided callback, passing the page's physical address and order. + * + * Return: 0 if the walk completed the specified tree, or the non-zero return + * value from the callback that stopped the walk. + */ +int kho_radix_walk_tree(struct kho_radix_tree *tree, + kho_radix_tree_walk_callback_t cb) +{ + if (WARN_ON_ONCE(!tree->root)) + return -EINVAL; - set_bit(pfn_high % PRESERVE_BITS, bits->preserve); + guard(mutex)(&tree->lock); - return 0; + return __kho_radix_walk_tree(tree->root, KHO_TREE_MAX_DEPTH - 1, 0, cb); +} +EXPORT_SYMBOL_GPL(kho_radix_walk_tree); + +static void __kho_unpreserve(struct kho_radix_tree *tree, + unsigned long pfn, unsigned long end_pfn) +{ + unsigned int order; + + while (pfn < end_pfn) { + order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); + + kho_radix_del_page(tree, pfn, order); + + pfn += 1 << order; + } } /* For physically contiguous 0-order pages. 
*/ @@ -318,161 +472,24 @@ struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages) } EXPORT_SYMBOL_GPL(kho_restore_pages); -/* Serialize and deserialize struct kho_mem_phys across kexec - * - * Record all the bitmaps in a linked list of pages for the next kernel to - * process. Each chunk holds bitmaps of the same order and each block of bitmaps - * starts at a given physical address. This allows the bitmaps to be sparse. The - * xarray is used to store them in a tree while building up the data structure, - * but the KHO successor kernel only needs to process them once in order. - * - * All of this memory is normal kmalloc() memory and is not marked for - * preservation. The successor kernel will remain isolated to the scratch space - * until it completes processing this list. Once processed all the memory - * storing these ranges will be marked as free. - */ - -struct khoser_mem_bitmap_ptr { - phys_addr_t phys_start; - DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *); -}; - -struct khoser_mem_chunk_hdr { - DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *); - unsigned int order; - unsigned int num_elms; -}; - -#define KHOSER_BITMAP_SIZE \ - ((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \ - sizeof(struct khoser_mem_bitmap_ptr)) - -struct khoser_mem_chunk { - struct khoser_mem_chunk_hdr hdr; - struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE]; -}; - -static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE); - -static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk, - unsigned long order) -{ - struct khoser_mem_chunk *chunk __free(free_page) = NULL; - - chunk = (void *)get_zeroed_page(GFP_KERNEL); - if (!chunk) - return ERR_PTR(-ENOMEM); - - if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE))) - return ERR_PTR(-EINVAL); - - chunk->hdr.order = order; - if (cur_chunk) - KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk); - return no_free_ptr(chunk); -} - -static void kho_mem_ser_free(struct 
khoser_mem_chunk *first_chunk) -{ - struct khoser_mem_chunk *chunk = first_chunk; - - while (chunk) { - struct khoser_mem_chunk *tmp = chunk; - - chunk = KHOSER_LOAD_PTR(chunk->hdr.next); - free_page((unsigned long)tmp); - } -} - -/* - * Update memory map property, if old one is found discard it via - * kho_mem_ser_free(). - */ -static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk) -{ - void *ptr; - u64 phys; - - ptr = fdt_getprop_w(kho_out.fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, NULL); - - /* Check and discard previous memory map */ - phys = get_unaligned((u64 *)ptr); - if (phys) - kho_mem_ser_free((struct khoser_mem_chunk *)phys_to_virt(phys)); - - /* Update with the new value */ - phys = first_chunk ? (u64)virt_to_phys(first_chunk) : 0; - put_unaligned(phys, (u64 *)ptr); -} - -static int kho_mem_serialize(struct kho_out *kho_out) +static int __init kho_preserved_memory_reserve(phys_addr_t phys, + unsigned int order) { - struct khoser_mem_chunk *first_chunk = NULL; - struct khoser_mem_chunk *chunk = NULL; - struct kho_mem_phys *physxa; - unsigned long order; - int err = -ENOMEM; - - xa_for_each(&kho_out->track.orders, order, physxa) { - struct kho_mem_phys_bits *bits; - unsigned long phys; - - chunk = new_chunk(chunk, order); - if (IS_ERR(chunk)) { - err = PTR_ERR(chunk); - goto err_free; - } - - if (!first_chunk) - first_chunk = chunk; - - xa_for_each(&physxa->phys_bits, phys, bits) { - struct khoser_mem_bitmap_ptr *elm; - - if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) { - chunk = new_chunk(chunk, order); - if (IS_ERR(chunk)) { - err = PTR_ERR(chunk); - goto err_free; - } - } + union kho_page_info info; + struct page *page; + u64 sz; - elm = &chunk->bitmaps[chunk->hdr.num_elms]; - chunk->hdr.num_elms++; - elm->phys_start = (phys * PRESERVE_BITS) - << (order + PAGE_SHIFT); - KHOSER_STORE_PTR(elm->bitmap, bits); - } - } + sz = 1 << (order + PAGE_SHIFT); + page = phys_to_page(phys); - kho_update_memory_map(first_chunk); + /* Reserve the 
memory preserved in KHO in memblock */ + memblock_reserve(phys, sz); + memblock_reserved_mark_noinit(phys, sz); + info.magic = KHO_PAGE_MAGIC; + info.order = order; + page->private = info.page_private; return 0; - -err_free: - kho_mem_ser_free(first_chunk); - return err; -} - -static void __init deserialize_bitmap(unsigned int order, - struct khoser_mem_bitmap_ptr *elm) -{ - struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap); - unsigned long bit; - - for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) { - int sz = 1 << (order + PAGE_SHIFT); - phys_addr_t phys = - elm->phys_start + (bit << (order + PAGE_SHIFT)); - struct page *page = phys_to_page(phys); - union kho_page_info info; - - memblock_reserve(phys, sz); - memblock_reserved_mark_noinit(phys, sz); - info.magic = KHO_PAGE_MAGIC; - info.order = order; - page->private = info.page_private; - } } /* Returns physical address of the preserved memory map from FDT */ @@ -483,25 +500,13 @@ static phys_addr_t __init kho_get_mem_map_phys(const void *fdt) mem_ptr = fdt_getprop(fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, &len); if (!mem_ptr || len != sizeof(u64)) { - pr_err("failed to get preserved memory bitmaps\n"); + pr_err("failed to get preserved memory map\n"); return 0; } return get_unaligned((const u64 *)mem_ptr); } -static void __init kho_mem_deserialize(struct khoser_mem_chunk *chunk) -{ - while (chunk) { - unsigned int i; - - for (i = 0; i != chunk->hdr.num_elms; i++) - deserialize_bitmap(chunk->hdr.order, - &chunk->bitmaps[i]); - chunk = KHOSER_LOAD_PTR(chunk->hdr.next); - } -} - /* * With KHO enabled, memory can become fragmented because KHO regions may * be anywhere in physical address space. 
The scratch regions give us a @@ -812,14 +817,14 @@ EXPORT_SYMBOL_GPL(kho_remove_subtree); */ int kho_preserve_folio(struct folio *folio) { + struct kho_radix_tree *tree = &kho_out.radix_tree; const unsigned long pfn = folio_pfn(folio); const unsigned int order = folio_order(folio); - struct kho_mem_track *track = &kho_out.track; if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order))) return -EINVAL; - return __kho_preserve_order(track, pfn, order); + return kho_radix_add_page(tree, pfn, order); } EXPORT_SYMBOL_GPL(kho_preserve_folio); @@ -833,11 +838,11 @@ EXPORT_SYMBOL_GPL(kho_preserve_folio); */ void kho_unpreserve_folio(struct folio *folio) { + struct kho_radix_tree *tree = &kho_out.radix_tree; const unsigned long pfn = folio_pfn(folio); const unsigned int order = folio_order(folio); - struct kho_mem_track *track = &kho_out.track; - __kho_unpreserve_order(track, pfn, order); + kho_radix_del_page(tree, pfn, order); } EXPORT_SYMBOL_GPL(kho_unpreserve_folio); @@ -853,7 +858,7 @@ EXPORT_SYMBOL_GPL(kho_unpreserve_folio); */ int kho_preserve_pages(struct page *page, unsigned long nr_pages) { - struct kho_mem_track *track = &kho_out.track; + struct kho_radix_tree *tree = &kho_out.radix_tree; const unsigned long start_pfn = page_to_pfn(page); const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn = start_pfn; @@ -869,7 +874,7 @@ int kho_preserve_pages(struct page *page, unsigned long nr_pages) const unsigned int order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); - err = __kho_preserve_order(track, pfn, order); + err = kho_radix_add_page(tree, pfn, order); if (err) { failed_pfn = pfn; break; @@ -879,7 +884,7 @@ int kho_preserve_pages(struct page *page, unsigned long nr_pages) } if (err) - __kho_unpreserve(track, start_pfn, failed_pfn); + __kho_unpreserve(tree, start_pfn, failed_pfn); return err; } @@ -897,11 +902,11 @@ EXPORT_SYMBOL_GPL(kho_preserve_pages); */ void kho_unpreserve_pages(struct page *page, unsigned long nr_pages) 
{ - struct kho_mem_track *track = &kho_out.track; + struct kho_radix_tree *tree = &kho_out.radix_tree; const unsigned long start_pfn = page_to_pfn(page); const unsigned long end_pfn = start_pfn + nr_pages; - __kho_unpreserve(track, start_pfn, end_pfn); + __kho_unpreserve(tree, start_pfn, end_pfn); } EXPORT_SYMBOL_GPL(kho_unpreserve_pages); @@ -960,14 +965,14 @@ err_free: static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk, unsigned short order) { - struct kho_mem_track *track = &kho_out.track; + struct kho_radix_tree *tree = &kho_out.radix_tree; unsigned long pfn = PHYS_PFN(virt_to_phys(chunk)); - __kho_unpreserve(track, pfn, pfn + 1); + __kho_unpreserve(tree, pfn, pfn + 1); for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) { pfn = PHYS_PFN(chunk->phys[i]); - __kho_unpreserve(track, pfn, pfn + (1 << order)); + __kho_unpreserve(tree, pfn, pfn + (1 << order)); } } @@ -1238,16 +1243,10 @@ EXPORT_SYMBOL_GPL(kho_restore_free); int kho_finalize(void) { - int ret; - if (!kho_enable) return -EOPNOTSUPP; guard(mutex)(&kho_out.lock); - ret = kho_mem_serialize(&kho_out); - if (ret) - return ret; - kho_out.finalized = true; return 0; @@ -1262,7 +1261,6 @@ bool kho_finalized(void) struct kho_in { phys_addr_t fdt_phys; phys_addr_t scratch_phys; - phys_addr_t mem_map_phys; struct kho_debugfs dbg; }; @@ -1330,18 +1328,46 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys) } EXPORT_SYMBOL_GPL(kho_retrieve_subtree); +static int __init kho_mem_retrieve(const void *fdt) +{ + struct kho_radix_tree tree; + const phys_addr_t *mem; + int len; + + /* Retrieve the KHO radix tree from passed-in FDT. 
*/ + mem = fdt_getprop(fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, &len); + + if (!mem || len != sizeof(*mem)) { + pr_err("failed to get preserved KHO memory tree\n"); + return -ENOENT; + } + + if (!*mem) + return -EINVAL; + + tree.root = phys_to_virt(*mem); + mutex_init(&tree.lock); + return kho_radix_walk_tree(&tree, kho_preserved_memory_reserve); +} + static __init int kho_out_fdt_setup(void) { + struct kho_radix_tree *tree = &kho_out.radix_tree; void *root = kho_out.fdt; - u64 empty_mem_map = 0; + u64 preserved_mem_tree_pa; int err; err = fdt_create(root, PAGE_SIZE); err |= fdt_finish_reservemap(root); err |= fdt_begin_node(root, ""); err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE); - err |= fdt_property(root, KHO_FDT_MEMORY_MAP_PROP_NAME, &empty_mem_map, - sizeof(empty_mem_map)); + + preserved_mem_tree_pa = virt_to_phys(tree->root); + + err |= fdt_property(root, KHO_FDT_MEMORY_MAP_PROP_NAME, + &preserved_mem_tree_pa, + sizeof(preserved_mem_tree_pa)); + err |= fdt_end_node(root); err |= fdt_finish(root); @@ -1350,16 +1376,23 @@ static __init int kho_out_fdt_setup(void) static __init int kho_init(void) { + struct kho_radix_tree *tree = &kho_out.radix_tree; const void *fdt = kho_get_fdt(); int err = 0; if (!kho_enable) return 0; + tree->root = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!tree->root) { + err = -ENOMEM; + goto err_free_scratch; + } + kho_out.fdt = kho_alloc_preserve(PAGE_SIZE); if (IS_ERR(kho_out.fdt)) { err = PTR_ERR(kho_out.fdt); - goto err_free_scratch; + goto err_free_kho_radix_tree_root; } err = kho_debugfs_init(); @@ -1405,6 +1438,9 @@ static __init int kho_init(void) err_free_fdt: kho_unpreserve_free(kho_out.fdt); +err_free_kho_radix_tree_root: + kfree(tree->root); + tree->root = NULL; err_free_scratch: kho_out.fdt = NULL; for (int i = 0; i < kho_scratch_cnt; i++) { @@ -1444,10 +1480,12 @@ static void __init kho_release_scratch(void) void __init kho_memory_init(void) { - if (kho_in.mem_map_phys) { + if (kho_in.scratch_phys) { 
kho_scratch = phys_to_virt(kho_in.scratch_phys); kho_release_scratch(); - kho_mem_deserialize(phys_to_virt(kho_in.mem_map_phys)); + + if (kho_mem_retrieve(kho_get_fdt())) + kho_in.fdt_phys = 0; } else { kho_reserve_scratch(); } @@ -1525,7 +1563,6 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, kho_in.fdt_phys = fdt_phys; kho_in.scratch_phys = scratch_phys; - kho_in.mem_map_phys = mem_map_phys; kho_scratch_cnt = scratch_cnt; populated = true; diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c index 2f93939168ab..548033fd8a62 100644 --- a/kernel/liveupdate/kexec_handover_debugfs.c +++ b/kernel/liveupdate/kexec_handover_debugfs.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "kexec_handover_internal.h" static struct dentry *debugfs_root; @@ -139,7 +140,7 @@ __init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt) const char *name = fdt_get_name(fdt, child, NULL); const u64 *fdt_phys; - fdt_phys = fdt_getprop(fdt, child, "fdt", &len); + fdt_phys = fdt_getprop(fdt, child, KHO_FDT_SUB_TREE_PROP_NAME, &len); if (!fdt_phys) continue; if (len != sizeof(*fdt_phys)) { -- cgit v1.2.3 From b9ec0ed907062a67a7cca2d04e7652aec06a0c35 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 23 Feb 2026 11:01:06 -0500 Subject: mm: vmalloc: streamline vmalloc memory accounting Use a vmstat counter instead of a custom, open-coded atomic. This has the added benefit of making the data available per-node, and prepares for cleaning up the memcg accounting as well. 
Link: https://lkml.kernel.org/r/20260223160147.3792777-1-hannes@cmpxchg.org Acked-by: Shakeel Butt Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Uladzislau Rezki (Sony) Cc: Joshua Hahn Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/proc/meminfo.c | 3 ++- include/linux/mmzone.h | 1 + include/linux/vmalloc.h | 3 --- mm/vmalloc.c | 19 ++++++++++--------- mm/vmstat.c | 1 + 5 files changed, 14 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index a458f1e112fd..549793f44726 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -126,7 +126,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "Committed_AS: ", committed); seq_printf(m, "VmallocTotal: %8lu kB\n", (unsigned long)VMALLOC_TOTAL >> 10); - show_val_kb(m, "VmallocUsed: ", vmalloc_nr_pages()); + show_val_kb(m, "VmallocUsed: ", + global_node_page_state(NR_VMALLOC)); show_val_kb(m, "VmallocChunk: ", 0ul); show_val_kb(m, "Percpu: ", pcpu_nr_pages()); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 546bca95ca40..db41b18a919d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -220,6 +220,7 @@ enum node_stat_item { NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */ NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */ NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */ + NR_VMALLOC, NR_KERNEL_STACK_KB, /* measured in KiB */ #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) NR_KERNEL_SCS_KB, /* measured in KiB */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index e8e94f90d686..3b02c0c6b371 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -286,8 +286,6 @@ int unregister_vmap_purge_notifier(struct notifier_block *nb); #ifdef CONFIG_MMU #define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) -unsigned long vmalloc_nr_pages(void); - int 
vm_area_map_pages(struct vm_struct *area, unsigned long start, unsigned long end, struct page **pages); void vm_area_unmap_pages(struct vm_struct *area, unsigned long start, @@ -304,7 +302,6 @@ static inline void set_vm_flush_reset_perms(void *addr) #else /* !CONFIG_MMU */ #define VMALLOC_TOTAL 0UL -static inline unsigned long vmalloc_nr_pages(void) { return 0; } static inline void set_vm_flush_reset_perms(void *addr) {} #endif /* CONFIG_MMU */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 61caa55a4402..e9d7c2a8c753 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1068,14 +1068,8 @@ static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); static void drain_vmap_area_work(struct work_struct *work); static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work); -static __cacheline_aligned_in_smp atomic_long_t nr_vmalloc_pages; static __cacheline_aligned_in_smp atomic_long_t vmap_lazy_nr; -unsigned long vmalloc_nr_pages(void) -{ - return atomic_long_read(&nr_vmalloc_pages); -} - static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) { struct rb_node *n = root->rb_node; @@ -3476,11 +3470,11 @@ void vfree(const void *addr) * High-order allocs for huge vmallocs are split, so * can be freed as an array of order-0 allocations */ + if (!(vm->flags & VM_MAP_PUT_PAGES)) + dec_node_page_state(page, NR_VMALLOC); __free_page(page); cond_resched(); } - if (!(vm->flags & VM_MAP_PUT_PAGES)) - atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages); kvfree(vm->pages); kfree(vm); } @@ -3668,6 +3662,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid, continue; } + mod_node_page_state(page_pgdat(page), NR_VMALLOC, 1 << large_order); + split_page(page, large_order); for (i = 0; i < (1U << large_order); i++) pages[nr_allocated + i] = page + i; @@ -3688,6 +3684,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, if (!order) { while (nr_allocated < nr_pages) { unsigned int nr, nr_pages_request; + int i; /* * A maximum allowed request is hard-coded and is 100 @@ -3711,6 +3708,9 @@ 
vm_area_alloc_pages(gfp_t gfp, int nid, nr_pages_request, pages + nr_allocated); + for (i = nr_allocated; i < nr_allocated + nr; i++) + inc_node_page_state(pages[i], NR_VMALLOC); + nr_allocated += nr; /* @@ -3735,6 +3735,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid, if (unlikely(!page)) break; + mod_node_page_state(page_pgdat(page), NR_VMALLOC, 1 << order); + /* * High-order allocations must be able to be treated as * independent small pages by callers (as they can with @@ -3877,7 +3879,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, vmalloc_gfp_adjust(gfp_mask, page_order), node, page_order, nr_small_pages, area->pages); - atomic_long_add(area->nr_pages, &nr_vmalloc_pages); /* All pages of vm should be charged to same memcg, so use first one. */ if (gfp_mask & __GFP_ACCOUNT && area->nr_pages) mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC, diff --git a/mm/vmstat.c b/mm/vmstat.c index 667474773dbc..2370c6fb1fcd 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1255,6 +1255,7 @@ const char * const vmstat_text[] = { [I(NR_KERNEL_MISC_RECLAIMABLE)] = "nr_kernel_misc_reclaimable", [I(NR_FOLL_PIN_ACQUIRED)] = "nr_foll_pin_acquired", [I(NR_FOLL_PIN_RELEASED)] = "nr_foll_pin_released", + [I(NR_VMALLOC)] = "nr_vmalloc", [I(NR_KERNEL_STACK_KB)] = "nr_kernel_stack", #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) [I(NR_KERNEL_SCS_KB)] = "nr_shadow_call_stack", -- cgit v1.2.3 From c466412c73c339e33e83b68770e5b556457c03de Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 23 Feb 2026 11:01:07 -0500 Subject: mm: memcontrol: switch to native NR_VMALLOC vmstat counter Eliminates the custom memcg counter and results in a single, consolidated accounting call in vmalloc code. 
Link: https://lkml.kernel.org/r/20260223160147.3792777-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Shakeel Butt Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Roman Gushchin Reviewed-by: Vishal Moola (Oracle) Cc: Joshua Hahn Cc: Michal Hocko Cc: Muchun Song Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 1 - mm/memcontrol.c | 4 ++-- mm/vmalloc.c | 16 ++++------------ 3 files changed, 6 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5695776f32c8..5173a9f16721 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -35,7 +35,6 @@ enum memcg_stat_item { MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, MEMCG_SOCK, MEMCG_PERCPU_B, - MEMCG_VMALLOC, MEMCG_KMEM, MEMCG_ZSWAP_B, MEMCG_ZSWAPPED, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 75df24ffdf25..eb54cdf99624 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -317,6 +317,7 @@ static const unsigned int memcg_node_stat_items[] = { NR_SHMEM_THPS, NR_FILE_THPS, NR_ANON_THPS, + NR_VMALLOC, NR_KERNEL_STACK_KB, NR_PAGETABLE, NR_SECONDARY_PAGETABLE, @@ -352,7 +353,6 @@ static const unsigned int memcg_stat_items[] = { MEMCG_SWAP, MEMCG_SOCK, MEMCG_PERCPU_B, - MEMCG_VMALLOC, MEMCG_KMEM, MEMCG_ZSWAP_B, MEMCG_ZSWAPPED, @@ -1364,7 +1364,7 @@ static const struct memory_stat memory_stats[] = { { "sec_pagetables", NR_SECONDARY_PAGETABLE }, { "percpu", MEMCG_PERCPU_B }, { "sock", MEMCG_SOCK }, - { "vmalloc", MEMCG_VMALLOC }, + { "vmalloc", NR_VMALLOC }, { "shmem", NR_SHMEM }, #ifdef CONFIG_ZSWAP { "zswap", MEMCG_ZSWAP_B }, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e9d7c2a8c753..6dda97c3799e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3459,9 +3459,6 @@ void vfree(const void *addr) if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) vm_reset_perms(vm); - /* All pages of vm should be charged to same memcg, so use first one. 
*/ - if (vm->nr_pages && !(vm->flags & VM_MAP_PUT_PAGES)) - mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC, -vm->nr_pages); for (i = 0; i < vm->nr_pages; i++) { struct page *page = vm->pages[i]; @@ -3471,7 +3468,7 @@ void vfree(const void *addr) * can be freed as an array of order-0 allocations */ if (!(vm->flags & VM_MAP_PUT_PAGES)) - dec_node_page_state(page, NR_VMALLOC); + mod_lruvec_page_state(page, NR_VMALLOC, -1); __free_page(page); cond_resched(); } @@ -3662,7 +3659,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, continue; } - mod_node_page_state(page_pgdat(page), NR_VMALLOC, 1 << large_order); + mod_lruvec_page_state(page, NR_VMALLOC, 1 << large_order); split_page(page, large_order); for (i = 0; i < (1U << large_order); i++) @@ -3709,7 +3706,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, pages + nr_allocated); for (i = nr_allocated; i < nr_allocated + nr; i++) - inc_node_page_state(pages[i], NR_VMALLOC); + mod_lruvec_page_state(pages[i], NR_VMALLOC, 1); nr_allocated += nr; @@ -3735,7 +3732,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, if (unlikely(!page)) break; - mod_node_page_state(page_pgdat(page), NR_VMALLOC, 1 << order); + mod_lruvec_page_state(page, NR_VMALLOC, 1 << order); /* * High-order allocations must be able to be treated as @@ -3879,11 +3876,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, vmalloc_gfp_adjust(gfp_mask, page_order), node, page_order, nr_small_pages, area->pages); - /* All pages of vm should be charged to same memcg, so use first one. */ - if (gfp_mask & __GFP_ACCOUNT && area->nr_pages) - mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC, - area->nr_pages); - /* * If not enough pages were obtained to accomplish an * allocation request, free them via vfree() if any. 
-- cgit v1.2.3 From d8d68d8111d894cf2406c2eee814ce1f4cf9e939 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Mar 2026 09:09:06 -0700 Subject: tracing: add __event_in_*irq() helpers Patch series "mm: vmscan: add PID and cgroup ID to vmscan tracepoints", v8. This patch (of 3): Some trace events want to expose in their output if they were triggered in an interrupt or softirq context. Instead of recording this in the event structure itself, as this information is stored in the flags portion of the event header, add helper macros that can be used in the print format: TP_printk("val=%d %s", __entry->val, __event_in_irq() ? "(in-irq)" : "") This will output "(in-irq)" for the event in the trace data if the event was triggered in hard or soft interrupt context. Link: https://lkml.kernel.org/r/20260316160908.42727-1-tballasi@linux.microsoft.com Link: https://lore.kernel.org/all/20251229132942.31a2b583@gandalf.local.home/ Link: https://lkml.kernel.org/r/20260316160908.42727-2-tballasi@linux.microsoft.com Signed-off-by: Steven Rostedt (Google) Signed-off-by: Thomas Ballasi Reviewed-by: Shakeel Butt Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Qi Zheng Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/trace/stages/stage3_trace_output.h | 8 ++++++++ include/trace/stages/stage7_class_define.h | 19 +++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'include') diff --git a/include/trace/stages/stage3_trace_output.h b/include/trace/stages/stage3_trace_output.h index fce85ea2df1c..56ec0c0595b1 100644 --- a/include/trace/stages/stage3_trace_output.h +++ b/include/trace/stages/stage3_trace_output.h @@ -150,3 +150,11 @@ #undef __get_buf #define __get_buf(len) trace_seq_acquire(p, (len)) + +#undef __event_in_hardirq +#undef __event_in_softirq +#undef __event_in_irq + +#define __event_in_hardirq() (__entry->ent.flags & TRACE_FLAG_HARDIRQ) 
+#define __event_in_softirq() (__entry->ent.flags & TRACE_FLAG_SOFTIRQ) +#define __event_in_irq() (__entry->ent.flags & (TRACE_FLAG_HARDIRQ | TRACE_FLAG_SOFTIRQ)) diff --git a/include/trace/stages/stage7_class_define.h b/include/trace/stages/stage7_class_define.h index fcd564a590f4..47008897a795 100644 --- a/include/trace/stages/stage7_class_define.h +++ b/include/trace/stages/stage7_class_define.h @@ -26,6 +26,25 @@ #undef __print_hex_dump #undef __get_buf +#undef __event_in_hardirq +#undef __event_in_softirq +#undef __event_in_irq + +/* + * The TRACE_FLAG_* are enums. Instead of using TRACE_DEFINE_ENUM(), + * use their hardcoded values. These values are parsed by user space + * tooling elsewhere so they will never change. + * + * See "enum trace_flag_type" in linux/trace_events.h: + * TRACE_FLAG_HARDIRQ + * TRACE_FLAG_SOFTIRQ + */ + +/* This is what is displayed in the format files */ +#define __event_in_hardirq() (REC->common_flags & 0x8) +#define __event_in_softirq() (REC->common_flags & 0x10) +#define __event_in_irq() (REC->common_flags & 0x18) + /* * The below is not executed in the kernel. It is only what is * displayed in the print format for userspace to parse. -- cgit v1.2.3 From 874a0a566ede40f3d6062cae8fe1022e616edd1a Mon Sep 17 00:00:00 2001 From: Thomas Ballasi Date: Mon, 16 Mar 2026 09:09:07 -0700 Subject: mm: vmscan: add cgroup IDs to vmscan tracepoints Memory reclaim events are currently difficult to attribute to specific cgroups, making debugging memory pressure issues challenging. This patch adds memory cgroup ID (memcg_id) to key vmscan tracepoints to enable better correlation and analysis. For operations not associated with a specific cgroup, the field is defaulted to 0. 
Link: https://lkml.kernel.org/r/20260316160908.42727-3-tballasi@linux.microsoft.com Signed-off-by: Thomas Ballasi Acked-by: Shakeel Butt Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Qi Zheng Cc: Steven Rostedt (Google) Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/trace/events/vmscan.h | 83 +++++++++++++++++++++++++------------------ mm/shrinker.c | 6 ++-- mm/vmscan.c | 17 ++++----- 3 files changed, 61 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index ea58e4656abf..c9e637c10f96 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -124,85 +124,92 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd, DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template, - TP_PROTO(int order, gfp_t gfp_flags), + TP_PROTO(gfp_t gfp_flags, int order, struct mem_cgroup *memcg), - TP_ARGS(order, gfp_flags), + TP_ARGS(gfp_flags, order, memcg), TP_STRUCT__entry( - __field( int, order ) __field( unsigned long, gfp_flags ) + __field( u64, memcg_id ) + __field( int, order ) ), TP_fast_assign( - __entry->order = order; __entry->gfp_flags = (__force unsigned long)gfp_flags; + __entry->order = order; + __entry->memcg_id = mem_cgroup_id(memcg); ), - TP_printk("order=%d gfp_flags=%s", + TP_printk("order=%d gfp_flags=%s memcg_id=%llu", __entry->order, - show_gfp_flags(__entry->gfp_flags)) + show_gfp_flags(__entry->gfp_flags), + __entry->memcg_id) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin, - TP_PROTO(int order, gfp_t gfp_flags), + TP_PROTO(gfp_t gfp_flags, int order, struct mem_cgroup *memcg), - TP_ARGS(order, gfp_flags) + TP_ARGS(gfp_flags, order, memcg) ); #ifdef CONFIG_MEMCG DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin, - TP_PROTO(int order, gfp_t gfp_flags), + TP_PROTO(gfp_t gfp_flags, 
int order, struct mem_cgroup *memcg), - TP_ARGS(order, gfp_flags) + TP_ARGS(gfp_flags, order, memcg) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin, - TP_PROTO(int order, gfp_t gfp_flags), + TP_PROTO(gfp_t gfp_flags, int order, struct mem_cgroup *memcg), - TP_ARGS(order, gfp_flags) + TP_ARGS(gfp_flags, order, memcg) ); #endif /* CONFIG_MEMCG */ DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template, - TP_PROTO(unsigned long nr_reclaimed), + TP_PROTO(unsigned long nr_reclaimed, struct mem_cgroup *memcg), - TP_ARGS(nr_reclaimed), + TP_ARGS(nr_reclaimed, memcg), TP_STRUCT__entry( __field( unsigned long, nr_reclaimed ) + __field( u64, memcg_id ) ), TP_fast_assign( __entry->nr_reclaimed = nr_reclaimed; + __entry->memcg_id = mem_cgroup_id(memcg); ), - TP_printk("nr_reclaimed=%lu", __entry->nr_reclaimed) + TP_printk("nr_reclaimed=%lu memcg_id=%llu", + __entry->nr_reclaimed, + __entry->memcg_id) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_direct_reclaim_end, - TP_PROTO(unsigned long nr_reclaimed), + TP_PROTO(unsigned long nr_reclaimed, struct mem_cgroup *memcg), - TP_ARGS(nr_reclaimed) + TP_ARGS(nr_reclaimed, memcg) ); #ifdef CONFIG_MEMCG DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_reclaim_end, - TP_PROTO(unsigned long nr_reclaimed), + TP_PROTO(unsigned long nr_reclaimed, struct mem_cgroup *memcg), - TP_ARGS(nr_reclaimed) + TP_ARGS(nr_reclaimed, memcg) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_reclaim_end, - TP_PROTO(unsigned long nr_reclaimed), + TP_PROTO(unsigned long nr_reclaimed, struct mem_cgroup *memcg), - TP_ARGS(nr_reclaimed) + TP_ARGS(nr_reclaimed, memcg) ); #endif /* CONFIG_MEMCG */ @@ -210,39 +217,42 @@ TRACE_EVENT(mm_shrink_slab_start, TP_PROTO(struct shrinker *shr, struct shrink_control *sc, long nr_objects_to_shrink, unsigned long cache_items, unsigned long long delta, unsigned long total_scan, - int priority), + int 
priority, struct mem_cgroup *memcg), TP_ARGS(shr, sc, nr_objects_to_shrink, cache_items, delta, total_scan, - priority), + priority, memcg), TP_STRUCT__entry( __field(struct shrinker *, shr) __field(void *, shrink) - __field(int, nid) __field(long, nr_objects_to_shrink) __field(unsigned long, gfp_flags) __field(unsigned long, cache_items) __field(unsigned long long, delta) __field(unsigned long, total_scan) __field(int, priority) + __field(int, nid) + __field(u64, memcg_id) ), TP_fast_assign( __entry->shr = shr; __entry->shrink = shr->scan_objects; - __entry->nid = sc->nid; __entry->nr_objects_to_shrink = nr_objects_to_shrink; __entry->gfp_flags = (__force unsigned long)sc->gfp_mask; __entry->cache_items = cache_items; __entry->delta = delta; __entry->total_scan = total_scan; __entry->priority = priority; + __entry->nid = sc->nid; + __entry->memcg_id = mem_cgroup_id(memcg); ), - TP_printk("%pS %p: nid: %d objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d", + TP_printk("%pS %p: nid: %d memcg_id: %llu objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d", __entry->shrink, __entry->shr, __entry->nid, + __entry->memcg_id, __entry->nr_objects_to_shrink, show_gfp_flags(__entry->gfp_flags), __entry->cache_items, @@ -253,35 +263,38 @@ TRACE_EVENT(mm_shrink_slab_start, TRACE_EVENT(mm_shrink_slab_end, TP_PROTO(struct shrinker *shr, int nid, int shrinker_retval, - long unused_scan_cnt, long new_scan_cnt, long total_scan), + long unused_scan_cnt, long new_scan_cnt, long total_scan, struct mem_cgroup *memcg), TP_ARGS(shr, nid, shrinker_retval, unused_scan_cnt, new_scan_cnt, - total_scan), + total_scan, memcg), TP_STRUCT__entry( __field(struct shrinker *, shr) - __field(int, nid) __field(void *, shrink) __field(long, unused_scan) __field(long, new_scan) - __field(int, retval) __field(long, total_scan) + __field(int, nid) + __field(int, retval) + __field(u64, memcg_id) ), TP_fast_assign( __entry->shr = 
shr; - __entry->nid = nid; __entry->shrink = shr->scan_objects; __entry->unused_scan = unused_scan_cnt; __entry->new_scan = new_scan_cnt; - __entry->retval = shrinker_retval; __entry->total_scan = total_scan; + __entry->nid = nid; + __entry->retval = shrinker_retval; + __entry->memcg_id = mem_cgroup_id(memcg); ), - TP_printk("%pS %p: nid: %d unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", + TP_printk("%pS %p: nid: %d memcg_id: %llu unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", __entry->shrink, __entry->shr, __entry->nid, + __entry->memcg_id, __entry->unused_scan, __entry->new_scan, __entry->total_scan, @@ -514,9 +527,9 @@ TRACE_EVENT(mm_vmscan_node_reclaim_begin, DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_node_reclaim_end, - TP_PROTO(unsigned long nr_reclaimed), + TP_PROTO(unsigned long nr_reclaimed, struct mem_cgroup *memcg), - TP_ARGS(nr_reclaimed) + TP_ARGS(nr_reclaimed, memcg) ); TRACE_EVENT(mm_vmscan_throttled, diff --git a/mm/shrinker.c b/mm/shrinker.c index 94646ee0af63..0f90d63afdeb 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -410,7 +410,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, total_scan = min(total_scan, (2 * freeable)); trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, - freeable, delta, total_scan, priority); + freeable, delta, total_scan, priority, + shrinkctl->memcg); /* * Normally, we should not scan less than batch_size objects in one @@ -461,7 +462,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, */ new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); - trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); + trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan, + shrinkctl->memcg); return freed; } diff --git a/mm/vmscan.c b/mm/vmscan.c index d531040a3593..2c954d370048 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6582,11 
+6582,11 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, return 1; set_task_reclaim_state(current, &sc.reclaim_state); - trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask); + trace_mm_vmscan_direct_reclaim_begin(sc.gfp_mask, order, 0); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); - trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); + trace_mm_vmscan_direct_reclaim_end(nr_reclaimed, 0); set_task_reclaim_state(current, NULL); return nr_reclaimed; @@ -6615,8 +6615,9 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); - trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, - sc.gfp_mask); + trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.gfp_mask, + sc.order, + memcg); /* * NOTE: Although we can get the priority field, using it @@ -6627,7 +6628,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, */ shrink_lruvec(lruvec, &sc); - trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); + trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed, memcg); *nr_scanned = sc.nr_scanned; @@ -6663,13 +6664,13 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); set_task_reclaim_state(current, &sc.reclaim_state); - trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask); + trace_mm_vmscan_memcg_reclaim_begin(sc.gfp_mask, 0, memcg); noreclaim_flag = memalloc_noreclaim_save(); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); memalloc_noreclaim_restore(noreclaim_flag); - trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); + trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed, memcg); set_task_reclaim_state(current, NULL); return nr_reclaimed; @@ -7643,7 +7644,7 @@ static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, delayacct_freepages_end(); psi_memstall_leave(&pflags); - 
trace_mm_vmscan_node_reclaim_end(sc->nr_reclaimed, 0); return sc->nr_reclaimed; } -- cgit v1.2.3 From 77a9c445b668765129f877d3c0d08ec4dc3ce77b Mon Sep 17 00:00:00 2001 From: Thomas Ballasi Date: Mon, 16 Mar 2026 09:09:08 -0700 Subject: mm: vmscan: add PIDs to vmscan tracepoints This change adds additional tracepoint variables to help debuggers attribute the events to specific processes. Link: https://lkml.kernel.org/r/20260316160908.42727-4-tballasi@linux.microsoft.com Signed-off-by: Thomas Ballasi Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Cc: Steven Rostedt (Google) Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/trace/events/vmscan.h | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index c9e637c10f96..4445a8d9218d 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -140,10 +140,12 @@ DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template, __entry->memcg_id = mem_cgroup_id(memcg); ), - TP_printk("order=%d gfp_flags=%s memcg_id=%llu", + TP_printk("order=%d gfp_flags=%s pid=%d memcg_id=%llu %s", __entry->order, show_gfp_flags(__entry->gfp_flags), - __entry->memcg_id) + __entry->ent.pid, + __entry->memcg_id, + __event_in_irq() ? "(in-irq)" : "") ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin, @@ -185,9 +187,11 @@ DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template, __entry->memcg_id = mem_cgroup_id(memcg); ), - TP_printk("nr_reclaimed=%lu memcg_id=%llu", + TP_printk("nr_reclaimed=%lu pid=%d memcg_id=%llu %s", __entry->nr_reclaimed, - __entry->memcg_id) + __entry->ent.pid, + __entry->memcg_id, + __event_in_irq() ? 
"(in-irq)" : "") ); DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_direct_reclaim_end, @@ -248,17 +252,19 @@ TRACE_EVENT(mm_shrink_slab_start, __entry->memcg_id = mem_cgroup_id(memcg); ), - TP_printk("%pS %p: nid: %d memcg_id: %llu objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d", + TP_printk("%pS %p: nid: %d pid: %d memcg_id: %llu objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d %s", __entry->shrink, __entry->shr, __entry->nid, + __entry->ent.pid, __entry->memcg_id, __entry->nr_objects_to_shrink, show_gfp_flags(__entry->gfp_flags), __entry->cache_items, __entry->delta, __entry->total_scan, - __entry->priority) + __entry->priority, + __event_in_irq() ? "(in-irq)" : "") ); TRACE_EVENT(mm_shrink_slab_end, @@ -290,15 +296,17 @@ TRACE_EVENT(mm_shrink_slab_end, __entry->memcg_id = mem_cgroup_id(memcg); ), - TP_printk("%pS %p: nid: %d memcg_id: %llu unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", + TP_printk("%pS %p: nid: %d pid: %d memcg_id: %llu unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d %s", __entry->shrink, __entry->shr, __entry->nid, + __entry->ent.pid, __entry->memcg_id, __entry->unused_scan, __entry->new_scan, __entry->total_scan, - __entry->retval) + __entry->retval, + __event_in_irq() ? "(in-irq)" : "") ); TRACE_EVENT(mm_vmscan_lru_isolate, -- cgit v1.2.3 From 1fb3d8c20bfadbbe2d9e5de18074de9282a52b5f Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Tue, 24 Feb 2026 22:21:01 +0800 Subject: mm/mmu_gather: replace IPI with synchronize_rcu() when batch allocation fails When freeing page tables, we try to batch them. If batch allocation fails (GFP_NOWAIT), __tlb_remove_table_one() immediately frees the one without batching. On !CONFIG_PT_RECLAIM, the fallback sends an IPI to all CPUs via tlb_remove_table_sync_one(). It disrupts all CPUs even when only a single process is unmapping memory. 
IPI broadcast was reported to hurt RT workloads[1]. tlb_remove_table_sync_one() synchronizes with lockless page-table walkers (e.g. GUP-fast) that rely on IRQ disabling. These walkers use local_irq_disable(), which is also an RCU read-side critical section. This patch introduces tlb_remove_table_sync_rcu() which uses RCU grace period (synchronize_rcu()) instead of IPI broadcast. This provides the same guarantee as IPI but without disrupting all CPUs. Since batch allocation already failed, we are in a slow path where sleeping is acceptable - we are in process context (unmap_region, exit_mmap) with only mmap_lock held. tlb_remove_table_sync_one() is retained for other callers (e.g., khugepaged after pmdp_collapse_flush(), tlb_finish_mmu() when tlb->fully_unshared_tables) that are not slow paths. Converting those may require different approaches such as targeted IPIs. Link: https://lore.kernel.org/linux-mm/1b27a3fa-359a-43d0-bdeb-c31341749367@kernel.org/ [1] Link: https://lore.kernel.org/linux-mm/20260202150957.GD1282955@noisy.programming.kicks-ass.net/ Link: https://lore.kernel.org/linux-mm/dfdfeac9-5cd5-46fc-a5c1-9ccf9bd3502a@intel.com/ Link: https://lore.kernel.org/linux-mm/bc489455-bb18-44dc-8518-ae75abda6bec@kernel.org/ Link: https://lkml.kernel.org/r/20260224142101.20500-1-lance.yang@linux.dev Signed-off-by: Lance Yang Suggested-by: Peter Zijlstra (Intel) Suggested-by: Dave Hansen Suggested-by: David Hildenbrand (Arm) Acked-by: David Hildenbrand (Arm) Acked-by: Peter Zijlstra (Intel) Cc: Arnd Bergmann Cc: Nicholas Piggin Cc: Nick Piggin Cc: Will Deacon Signed-off-by: Andrew Morton --- include/asm-generic/tlb.h | 4 ++++ mm/mmu_gather.c | 21 ++++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 4aeac0c3d3f0..bdcc2778ac64 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -251,6 +251,8 @@ static inline void tlb_remove_table(struct 
mmu_gather *tlb, void *table) void tlb_remove_table_sync_one(void); +void tlb_remove_table_sync_rcu(void); + #else #ifdef tlb_needs_table_invalidate @@ -259,6 +261,8 @@ void tlb_remove_table_sync_one(void); static inline void tlb_remove_table_sync_one(void) { } +static inline void tlb_remove_table_sync_rcu(void) { } + #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index fe5b6a031717..3985d856de7f 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -296,6 +296,25 @@ static void tlb_remove_table_free(struct mmu_table_batch *batch) call_rcu(&batch->rcu, tlb_remove_table_rcu); } +/** + * tlb_remove_table_sync_rcu - synchronize with software page-table walkers + * + * Like tlb_remove_table_sync_one() but uses RCU grace period instead of IPI + * broadcast. Use in slow paths where sleeping is acceptable. + * + * Software/Lockless page-table walkers use local_irq_disable(), which is also + * an RCU read-side critical section. synchronize_rcu() waits for all such + * sections, providing the same guarantee as tlb_remove_table_sync_one() but + * without disrupting all CPUs with IPIs. + * + * Do not use for freeing memory. Use RCU callbacks instead to avoid latency + * spikes. + */ +void tlb_remove_table_sync_rcu(void) +{ + synchronize_rcu(); +} + #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */ static void tlb_remove_table_free(struct mmu_table_batch *batch) @@ -339,7 +358,7 @@ static inline void __tlb_remove_table_one(void *table) #else static inline void __tlb_remove_table_one(void *table) { - tlb_remove_table_sync_one(); + tlb_remove_table_sync_rcu(); __tlb_remove_table(table); } #endif /* CONFIG_PT_RECLAIM */ -- cgit v1.2.3 From 2b8acf8450f577d3785dacfd616630b76dc8f88d Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 25 Feb 2026 16:13:58 +0000 Subject: mm: introduce vm_mmap_shadow_stack() as a helper for VM_SHADOW_STACK mappings Patch series "mm: arch/shstk: Common shadow stack mapping helper and VM_NOHUGEPAGE", v2. 
A series to extract the common shadow stack mmap into a separate helper for arm64, riscv and x86. This patch (of 5): arm64, riscv and x86 use a similar pattern for mapping the user shadow stack (cloned from x86). Extract this into a helper to facilitate code reuse. The call to do_mmap() from the new helper uses PROT_READ|PROT_WRITE prot bits instead of the PROT_READ with an explicit VM_WRITE vm_flag. The x86 intent was to avoid PROT_WRITE implying normal write since the shadow stack is not writable by normal stores. However, from a kernel perspective, the vma is writeable. Functionally there is no difference. Link: https://lkml.kernel.org/r/20260225161404.3157851-1-catalin.marinas@arm.com Link: https://lkml.kernel.org/r/20260225161404.3157851-2-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Tested-by: Deepak Gupta Reviewed-by: Mark Brown Acked-by: David Hildenbrand (Arm) Reviewed-by: Mike Rapoport (Microsoft) Cc: Albert Ou Cc: Alexandre Ghiti Cc: "Borislav Petkov (AMD)" Cc: "Edgecombe, Rick P" Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Liam R. 
Howlett" Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Palmer Dabbelt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Will Deacon Cc: Dave Hansen Cc: Paul Walmsley Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ mm/util.c | 25 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index abb4963c1f06..bb0cfe38ca19 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3903,6 +3903,8 @@ extern int vm_munmap(unsigned long, size_t); extern unsigned long __must_check vm_mmap(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); +extern unsigned long __must_check vm_mmap_shadow_stack(unsigned long addr, + unsigned long len, unsigned long flags); struct vm_unmapped_area_info { #define VM_UNMAPPED_AREA_TOPDOWN 1 diff --git a/mm/util.c b/mm/util.c index b05ab6f97e11..51f7f417e91f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -618,6 +618,31 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } EXPORT_SYMBOL(vm_mmap); +#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK +/* + * Perform a userland memory mapping for a shadow stack into the current + * process address space. This is intended to be used by architectures that + * support user shadow stacks. + */ +unsigned long vm_mmap_shadow_stack(unsigned long addr, unsigned long len, + unsigned long flags) +{ + struct mm_struct *mm = current->mm; + unsigned long ret, unused; + + flags |= MAP_ANONYMOUS | MAP_PRIVATE; + if (addr) + flags |= MAP_FIXED_NOREPLACE; + + mmap_write_lock(mm); + ret = do_mmap(NULL, addr, len, PROT_READ | PROT_WRITE, flags, + VM_SHADOW_STACK, 0, &unused, NULL); + mmap_write_unlock(mm); + + return ret; +} +#endif /* CONFIG_ARCH_HAS_USER_SHADOW_STACK */ + /** * __vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. 
-- cgit v1.2.3 From cbf56f9981014ee48ae9b9e2254f31d1642b8f8f Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Wed, 25 Feb 2026 18:44:25 -0500 Subject: mm: remove stray references to struct pagevec Patch series "mm: Remove stray references to pagevec", v2. struct pagevec was removed in commit 1e0877d58b1e ("mm: remove struct pagevec"). Remove any stray references to it and rename relevant files and macros accordingly. While at it, remove unnecessary #includes of pagevec.h (now folio_batch.h) in .c files. There are probably more of these that could be removed in .h files, but those are more complex to verify. This patch (of 4): struct pagevec was removed in commit 1e0877d58b1e ("mm: remove struct pagevec"). Remove remaining forward declarations and change __folio_batch_release()'s declaration to match its definition. Link: https://lkml.kernel.org/r/20260225-pagevec_cleanup-v2-0-716868cc2d11@columbia.edu Link: https://lkml.kernel.org/r/20260225-pagevec_cleanup-v2-1-716868cc2d11@columbia.edu Signed-off-by: Tal Zussman Reviewed-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand (Arm) Acked-by: Chris Li Acked-by: Zi Yan Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Christian Brauner Cc: Jan Kara Signed-off-by: Andrew Morton --- fs/afs/internal.h | 1 - fs/f2fs/f2fs.h | 2 -- include/linux/pagevec.h | 2 +- include/linux/swap.h | 2 -- 4 files changed, 1 insertion(+), 6 deletions(-) (limited to 'include') diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 009064b8d661..599353c33337 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -31,7 +31,6 @@ #define AFS_CELL_MAX_ADDRS 15 -struct pagevec; struct afs_call; struct afs_vnode; struct afs_server_probe; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bb34e864d0ef..d9e8531a5301 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -28,8 +28,6 @@ #include #include -struct pagevec; - #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) #else diff --git a/include/linux/pagevec.h 
b/include/linux/pagevec.h index 63be5a451627..007affabf335 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -93,7 +93,7 @@ static inline struct folio *folio_batch_next(struct folio_batch *fbatch) return fbatch->folios[fbatch->i++]; } -void __folio_batch_release(struct folio_batch *pvec); +void __folio_batch_release(struct folio_batch *fbatch); static inline void folio_batch_release(struct folio_batch *fbatch) { diff --git a/include/linux/swap.h b/include/linux/swap.h index 0effe3cc50f5..4b1f13b5bbad 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -20,8 +20,6 @@ struct notifier_block; struct bio; -struct pagevec; - #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff #define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */ -- cgit v1.2.3 From 4e1d77a8f382a0ef4dd7732bb1986c8143600def Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Wed, 25 Feb 2026 18:44:27 -0500 Subject: folio_batch: rename pagevec.h to folio_batch.h struct pagevec was removed in commit 1e0877d58b1e ("mm: remove struct pagevec"). Rename include/linux/pagevec.h to reflect reality and update includes tree-wide. Add the new filename to MAINTAINERS explicitly, as it no longer matches the "include/linux/page[-_]*" pattern in MEMORY MANAGEMENT - CORE. 
Link: https://lkml.kernel.org/r/20260225-pagevec_cleanup-v2-3-716868cc2d11@columbia.edu Signed-off-by: Tal Zussman Acked-by: David Hildenbrand (Arm) Reviewed-by: Jan Kara Acked-by: Zi Yan Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Chris Li Cc: Christian Brauner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + drivers/gpu/drm/drm_gem.c | 2 +- drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 2 +- drivers/gpu/drm/i915/gt/intel_gtt.h | 2 +- drivers/gpu/drm/i915/i915_gpu_error.c | 2 +- fs/btrfs/compression.c | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/tests/extent-io-tests.c | 2 +- fs/buffer.c | 2 +- fs/ceph/addr.c | 2 +- fs/ext4/inode.c | 2 +- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/compress.c | 2 +- fs/f2fs/data.c | 2 +- fs/f2fs/node.c | 2 +- fs/gfs2/aops.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/nilfs2/btree.c | 2 +- fs/nilfs2/page.c | 2 +- fs/nilfs2/segment.c | 2 +- fs/ramfs/file-nommu.c | 2 +- include/linux/folio_batch.h | 105 ++++++++++++++++++++++++++++++ include/linux/folio_queue.h | 2 +- include/linux/iomap.h | 2 +- include/linux/pagevec.h | 105 ------------------------------ include/linux/sunrpc/svc.h | 2 +- include/linux/writeback.h | 2 +- mm/filemap.c | 2 +- mm/gup.c | 2 +- mm/memcontrol.c | 2 +- mm/mlock.c | 2 +- mm/page-writeback.c | 2 +- mm/page_alloc.c | 2 +- mm/shmem.c | 2 +- mm/swap.c | 2 +- mm/swap_state.c | 2 +- mm/truncate.c | 2 +- mm/vmscan.c | 2 +- 38 files changed, 141 insertions(+), 140 deletions(-) create mode 100644 include/linux/folio_batch.h delete mode 100644 include/linux/pagevec.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 7049d85c586e..7a1b94a4aea2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16653,6 +16653,7 @@ L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: include/linux/folio_batch.h F: include/linux/gfp.h F: include/linux/gfp_types.h F: include/linux/highmem.h diff --git a/drivers/gpu/drm/drm_gem.c 
b/drivers/gpu/drm/drm_gem.c index 891c3bff5ae0..dc4534fb175c 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index 720a9ad39aa2..06543ae60706 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -3,7 +3,7 @@ * Copyright © 2014-2016 Intel Corporation */ -#include +#include #include #include #include diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h index 9d3a3ad567a0..b54ee4f25af1 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.h +++ b/drivers/gpu/drm/i915/gt/intel_gtt.h @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index a99b4e45d26c..ffe5f24594c9 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 85199944c1eb..de40b8934725 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5f97a3d2a8d7..89649ef5107a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include "extent_io.h" diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index a0187d6163df..b2aacf846c8b 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -4,7 +4,7 @@ */ #include -#include +#include #include #include #include diff --git a/fs/buffer.c b/fs/buffer.c index 
22b43642ba57..f3122160ee2d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -45,7 +45,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 2090fc78529c..bbeafbc777ee 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 396dc3a5d16b..58f982885187 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6dd39b7de11a..0143365c07dc 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 8c76400ba631..614e00b8ffdc 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include "f2fs.h" #include "node.h" diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 338df7a2aea6..90e8ef625d82 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 74992fd9c9b6..ba0272314528 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include "f2fs.h" diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index e79ad087512a..dae3dc4ee6f7 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 22c799000edb..2ec3e4231252 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git 
a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index dd0c8e560ef6..b400cfcdc803 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include "nilfs.h" #include "page.h" #include "btnode.h" diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 56c4da417b6a..a9d8aa65416f 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include "nilfs.h" #include "page.h" diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 098a3bd103e0..6d62de64a309 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 0f8e838ece07..2f79bcb89d2e 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/folio_batch.h b/include/linux/folio_batch.h new file mode 100644 index 000000000000..a2f3d3043f7e --- /dev/null +++ b/include/linux/folio_batch.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * include/linux/folio_batch.h + * + * In many places it is efficient to batch an operation up against multiple + * folios. A folio_batch is a container which is used for that. + */ + +#ifndef _LINUX_FOLIO_BATCH_H +#define _LINUX_FOLIO_BATCH_H + +#include + +/* 31 pointers + header align the folio_batch structure to a power of two */ +#define PAGEVEC_SIZE 31 + +struct folio; + +/** + * struct folio_batch - A collection of folios. + * + * The folio_batch is used to amortise the cost of retrieving and + * operating on a set of folios. The order of folios in the batch may be + * significant (eg delete_from_page_cache_batch()). Some users of the + * folio_batch store "exceptional" entries in it which can be removed + * by calling folio_batch_remove_exceptionals(). 
+ */ +struct folio_batch { + unsigned char nr; + unsigned char i; + bool percpu_pvec_drained; + struct folio *folios[PAGEVEC_SIZE]; +}; + +/** + * folio_batch_init() - Initialise a batch of folios + * @fbatch: The folio batch. + * + * A freshly initialised folio_batch contains zero folios. + */ +static inline void folio_batch_init(struct folio_batch *fbatch) +{ + fbatch->nr = 0; + fbatch->i = 0; + fbatch->percpu_pvec_drained = false; +} + +static inline void folio_batch_reinit(struct folio_batch *fbatch) +{ + fbatch->nr = 0; + fbatch->i = 0; +} + +static inline unsigned int folio_batch_count(const struct folio_batch *fbatch) +{ + return fbatch->nr; +} + +static inline unsigned int folio_batch_space(const struct folio_batch *fbatch) +{ + return PAGEVEC_SIZE - fbatch->nr; +} + +/** + * folio_batch_add() - Add a folio to a batch. + * @fbatch: The folio batch. + * @folio: The folio to add. + * + * The folio is added to the end of the batch. + * The batch must have previously been initialised using folio_batch_init(). + * + * Return: The number of slots still available. + */ +static inline unsigned folio_batch_add(struct folio_batch *fbatch, + struct folio *folio) +{ + fbatch->folios[fbatch->nr++] = folio; + return folio_batch_space(fbatch); +} + +/** + * folio_batch_next - Return the next folio to process. + * @fbatch: The folio batch being processed. + * + * Use this function to implement a queue of folios. + * + * Return: The next folio in the queue, or NULL if the queue is empty. 
+ */ +static inline struct folio *folio_batch_next(struct folio_batch *fbatch) +{ + if (fbatch->i == fbatch->nr) + return NULL; + return fbatch->folios[fbatch->i++]; +} + +void __folio_batch_release(struct folio_batch *fbatch); + +static inline void folio_batch_release(struct folio_batch *fbatch) +{ + if (folio_batch_count(fbatch)) + __folio_batch_release(fbatch); +} + +void folio_batch_remove_exceptionals(struct folio_batch *fbatch); +#endif /* _LINUX_FOLIO_BATCH_H */ diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h index adab609c972e..0d3765fa9d1d 100644 --- a/include/linux/folio_queue.h +++ b/include/linux/folio_queue.h @@ -14,7 +14,7 @@ #ifndef _LINUX_FOLIO_QUEUE_H #define _LINUX_FOLIO_QUEUE_H -#include +#include #include /* diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 99b7209dabd7..4551613cea2f 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include struct address_space; struct fiemap_extent_info; diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h deleted file mode 100644 index 007affabf335..000000000000 --- a/include/linux/pagevec.h +++ /dev/null @@ -1,105 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * include/linux/pagevec.h - * - * In many places it is efficient to batch an operation up against multiple - * folios. A folio_batch is a container which is used for that. - */ - -#ifndef _LINUX_PAGEVEC_H -#define _LINUX_PAGEVEC_H - -#include - -/* 31 pointers + header align the folio_batch structure to a power of two */ -#define PAGEVEC_SIZE 31 - -struct folio; - -/** - * struct folio_batch - A collection of folios. - * - * The folio_batch is used to amortise the cost of retrieving and - * operating on a set of folios. The order of folios in the batch may be - * significant (eg delete_from_page_cache_batch()). 
Some users of the - * folio_batch store "exceptional" entries in it which can be removed - * by calling folio_batch_remove_exceptionals(). - */ -struct folio_batch { - unsigned char nr; - unsigned char i; - bool percpu_pvec_drained; - struct folio *folios[PAGEVEC_SIZE]; -}; - -/** - * folio_batch_init() - Initialise a batch of folios - * @fbatch: The folio batch. - * - * A freshly initialised folio_batch contains zero folios. - */ -static inline void folio_batch_init(struct folio_batch *fbatch) -{ - fbatch->nr = 0; - fbatch->i = 0; - fbatch->percpu_pvec_drained = false; -} - -static inline void folio_batch_reinit(struct folio_batch *fbatch) -{ - fbatch->nr = 0; - fbatch->i = 0; -} - -static inline unsigned int folio_batch_count(const struct folio_batch *fbatch) -{ - return fbatch->nr; -} - -static inline unsigned int folio_batch_space(const struct folio_batch *fbatch) -{ - return PAGEVEC_SIZE - fbatch->nr; -} - -/** - * folio_batch_add() - Add a folio to a batch. - * @fbatch: The folio batch. - * @folio: The folio to add. - * - * The folio is added to the end of the batch. - * The batch must have previously been initialised using folio_batch_init(). - * - * Return: The number of slots still available. - */ -static inline unsigned folio_batch_add(struct folio_batch *fbatch, - struct folio *folio) -{ - fbatch->folios[fbatch->nr++] = folio; - return folio_batch_space(fbatch); -} - -/** - * folio_batch_next - Return the next folio to process. - * @fbatch: The folio batch being processed. - * - * Use this function to implement a queue of folios. - * - * Return: The next folio in the queue, or NULL if the queue is empty. 
- */ -static inline struct folio *folio_batch_next(struct folio_batch *fbatch) -{ - if (fbatch->i == fbatch->nr) - return NULL; - return fbatch->folios[fbatch->i++]; -} - -void __folio_batch_release(struct folio_batch *fbatch); - -static inline void folio_batch_release(struct folio_batch *fbatch) -{ - if (folio_batch_count(fbatch)) - __folio_batch_release(fbatch); -} - -void folio_batch_remove_exceptionals(struct folio_batch *fbatch); -#endif /* _LINUX_PAGEVEC_H */ diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 4dc14c7a711b..a11acf5cd63b 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include /* diff --git a/include/linux/writeback.h b/include/linux/writeback.h index e530112c4b3a..62552a2ce5b9 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -11,7 +11,7 @@ #include #include #include -#include +#include struct bio; diff --git a/mm/filemap.c b/mm/filemap.c index 406cef06b684..7cc6607dc28f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/gup.c b/mm/gup.c index 8e7dc2c6ee73..ad9ded39609c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/mm/memcontrol.c b/mm/memcontrol.c index eb54cdf99624..87614cfc4a3e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/mlock.c b/mm/mlock.c index 2f699c3497a5..1a92d16f3684 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 601a5e048d12..1009bb042ba4 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -33,7 +33,7 @@ #include #include #include -#include 
+#include #include #include #include diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d88c8c67ac0b..74b603872f34 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/shmem.c b/mm/shmem.c index cfed6c3ff853..149fdb051170 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -61,7 +61,7 @@ static struct vfsmount *shm_mnt __ro_after_init; #include #include #include -#include +#include #include #include #include diff --git a/mm/swap.c b/mm/swap.c index bb19ccbece46..2e517ede6561 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/swap_state.c b/mm/swap_state.c index 32d9d877bda8..a0c64db2b275 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/truncate.c b/mm/truncate.c index 12467c1bd711..df0b7a7e6aff 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/vmscan.c b/mm/vmscan.c index 2c954d370048..4ab461f8c65a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -44,7 +44,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From 511f04aac469a3ae04f7f2588101020aebb19c90 Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Wed, 25 Feb 2026 18:44:28 -0500 Subject: folio_batch: rename PAGEVEC_SIZE to FOLIO_BATCH_SIZE struct pagevec no longer exists. Rename the macro appropriately. 
Link: https://lkml.kernel.org/r/20260225-pagevec_cleanup-v2-4-716868cc2d11@columbia.edu Signed-off-by: Tal Zussman Acked-by: David Hildenbrand (Arm) Reviewed-by: Jan Kara Acked-by: Zi Yan Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Chris Li Cc: Christian Brauner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/btrfs/extent_io.c | 4 ++-- include/linux/folio_batch.h | 6 +++--- include/linux/folio_queue.h | 6 +++--- mm/shmem.c | 4 ++-- mm/swap.c | 2 +- mm/swap_state.c | 2 +- mm/truncate.c | 6 +++--- 7 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 89649ef5107a..070c8759b0b4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2095,13 +2095,13 @@ static void buffer_tree_tag_for_writeback(struct btrfs_fs_info *fs_info, struct eb_batch { unsigned int nr; unsigned int cur; - struct extent_buffer *ebs[PAGEVEC_SIZE]; + struct extent_buffer *ebs[FOLIO_BATCH_SIZE]; }; static inline bool eb_batch_add(struct eb_batch *batch, struct extent_buffer *eb) { batch->ebs[batch->nr++] = eb; - return (batch->nr < PAGEVEC_SIZE); + return (batch->nr < FOLIO_BATCH_SIZE); } static inline void eb_batch_init(struct eb_batch *batch) diff --git a/include/linux/folio_batch.h b/include/linux/folio_batch.h index a2f3d3043f7e..b45946adc50b 100644 --- a/include/linux/folio_batch.h +++ b/include/linux/folio_batch.h @@ -12,7 +12,7 @@ #include /* 31 pointers + header align the folio_batch structure to a power of two */ -#define PAGEVEC_SIZE 31 +#define FOLIO_BATCH_SIZE 31 struct folio; @@ -29,7 +29,7 @@ struct folio_batch { unsigned char nr; unsigned char i; bool percpu_pvec_drained; - struct folio *folios[PAGEVEC_SIZE]; + struct folio *folios[FOLIO_BATCH_SIZE]; }; /** @@ -58,7 +58,7 @@ static inline unsigned int folio_batch_count(const struct folio_batch *fbatch) static inline unsigned int folio_batch_space(const struct folio_batch *fbatch) { - return PAGEVEC_SIZE - fbatch->nr; + return 
FOLIO_BATCH_SIZE - fbatch->nr; } /** diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h index 0d3765fa9d1d..f6d5f1f127c9 100644 --- a/include/linux/folio_queue.h +++ b/include/linux/folio_queue.h @@ -29,12 +29,12 @@ */ struct folio_queue { struct folio_batch vec; /* Folios in the queue segment */ - u8 orders[PAGEVEC_SIZE]; /* Order of each folio */ + u8 orders[FOLIO_BATCH_SIZE]; /* Order of each folio */ struct folio_queue *next; /* Next queue segment or NULL */ struct folio_queue *prev; /* Previous queue segment of NULL */ unsigned long marks; /* 1-bit mark per folio */ unsigned long marks2; /* Second 1-bit mark per folio */ -#if PAGEVEC_SIZE > BITS_PER_LONG +#if FOLIO_BATCH_SIZE > BITS_PER_LONG #error marks is not big enough #endif unsigned int rreq_id; @@ -70,7 +70,7 @@ static inline void folioq_init(struct folio_queue *folioq, unsigned int rreq_id) */ static inline unsigned int folioq_nr_slots(const struct folio_queue *folioq) { - return PAGEVEC_SIZE; + return FOLIO_BATCH_SIZE; } /** diff --git a/mm/shmem.c b/mm/shmem.c index 149fdb051170..5e7dcf5bc5d3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1113,7 +1113,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, uoff_t lend, pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; pgoff_t end = (lend + 1) >> PAGE_SHIFT; struct folio_batch fbatch; - pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t indices[FOLIO_BATCH_SIZE]; struct folio *folio; bool same_folio; long nr_swaps_freed = 0; @@ -1510,7 +1510,7 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type) struct address_space *mapping = inode->i_mapping; pgoff_t start = 0; struct folio_batch fbatch; - pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t indices[FOLIO_BATCH_SIZE]; int ret = 0; do { diff --git a/mm/swap.c b/mm/swap.c index 2e517ede6561..78b4aa811fc6 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1018,7 +1018,7 @@ EXPORT_SYMBOL(folios_put_refs); void release_pages(release_pages_arg arg, int nr) { struct folio_batch 
fbatch; - int refs[PAGEVEC_SIZE]; + int refs[FOLIO_BATCH_SIZE]; struct encoded_page **encoded = arg.encoded_pages; int i; diff --git a/mm/swap_state.c b/mm/swap_state.c index a0c64db2b275..6313b59d7eab 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -385,7 +385,7 @@ void free_folio_and_swap_cache(struct folio *folio) void free_pages_and_swap_cache(struct encoded_page **pages, int nr) { struct folio_batch folios; - unsigned int refs[PAGEVEC_SIZE]; + unsigned int refs[FOLIO_BATCH_SIZE]; folio_batch_init(&folios); for (int i = 0; i < nr; i++) { diff --git a/mm/truncate.c b/mm/truncate.c index df0b7a7e6aff..2931d66c16d0 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -369,7 +369,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pgoff_t start; /* inclusive */ pgoff_t end; /* exclusive */ struct folio_batch fbatch; - pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t indices[FOLIO_BATCH_SIZE]; pgoff_t index; int i; struct folio *folio; @@ -534,7 +534,7 @@ EXPORT_SYMBOL(truncate_inode_pages_final); unsigned long mapping_try_invalidate(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_failed) { - pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t indices[FOLIO_BATCH_SIZE]; struct folio_batch fbatch; pgoff_t index = start; unsigned long ret; @@ -672,7 +672,7 @@ failed: int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { - pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t indices[FOLIO_BATCH_SIZE]; struct folio_batch fbatch; pgoff_t index; int i; -- cgit v1.2.3 From a2c77ec320a99581e8272868ccfa53a7d7a7b168 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:39 +0000 Subject: mm: move MAX_FOLIO_ORDER definition to mmzone.h Patch series "mm: Eliminate fake head pages from vmemmap optimization", v7. This series removes "fake head pages" from the HugeTLB vmemmap optimization (HVO) by changing how tail pages encode their relationship to the head page. 
It simplifies compound_head() and page_ref_add_unless(). Both are in the hot path. Background ========== HVO reduces memory overhead by freeing vmemmap pages for HugeTLB pages and remapping the freed virtual addresses to a single physical page. Previously, all tail page vmemmap entries were remapped to the first vmemmap page (containing the head struct page), creating "fake heads" - tail pages that appear to have PG_head set when accessed through the deduplicated vmemmap. This required special handling in compound_head() to detect and work around fake heads, adding complexity and overhead to a very hot path. New Approach ============ For architectures/configs where sizeof(struct page) is a power of 2 (the common case), this series changes how the position of the head page is encoded in the tail pages. Instead of storing a pointer to the head page, the ->compound_info (renamed from ->compound_head) now stores a mask. The mask can be applied to any tail page's virtual address to compute the head page address. The key insight is that all tail pages of the same order now have identical compound_info values, regardless of which compound page they belong to. In v7, these shared tail pages are allocated per-zone. This ensures that zone information (stored in page->flags) is correct even for shared tail pages, removing the need for the special-casing in page_zonenum() proposed in earlier versions. To support per-zone shared pages for boot-allocated gigantic pages, the vmemmap population is deferred until zones are initialized. This simplifies the logic significantly and allows the removal of vmemmap_undo_hvo(). Benefits ======== 1. Simplified compound_head(): No fake head detection needed, can be implemented in a branchless manner. 2. Simplified page_ref_add_unless(): RCU protection removed since there's no race with fake head remapping. 3. 
Cleaner architecture: The shared tail pages are truly read-only and contain valid tail page metadata. If sizeof(struct page) is not a power of 2, there are no functional changes. HVO is not supported in this configuration. I had hoped to see a performance improvement, but my testing thus far has shown either no change or only a slight improvement within the noise. Series Organization =================== Patch 1: Move MAX_FOLIO_ORDER definition to mmzone.h. Patches 2-4: Refactoring of field names and interfaces. Patches 5-6: Architecture alignment for LoongArch and RISC-V. Patch 7: Mask-based compound_head() implementation. Patch 8: Add memmap alignment checks. Patch 9: Branchless compound_head() optimization. Patch 10: Defer vmemmap population for bootmem hugepages. Patch 11: Refactor vmemmap_walk. Patch 12: x86 vDSO build fix. Patch 13: Eliminate fake heads with per-zone shared tail pages. Patches 14-16: Cleanup of fake head infrastructure. Patch 17: Documentation update. Patch 18: Use compound_head() in page_slab(). This patch (of 17): Move the MAX_FOLIO_ORDER definition from mm.h to mmzone.h. This is preparation for adding the vmemmap_tails array to struct zone, which requires MAX_FOLIO_ORDER to be available in mmzone.h. 
Link: https://lkml.kernel.org/r/20260227194302.274384-1-kas@kernel.org Link: https://lkml.kernel.org/r/20260227194302.274384-2-kas@kernel.org Signed-off-by: Kiryl Shutsemau Acked-by: David Hildenbrand (Red Hat) Acked-by: Zi Yan Acked-by: Muchun Song Acked-by: Usama Arif Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/mm.h | 31 ------------------------------- include/linux/mmzone.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index bb0cfe38ca19..4e999c21d89a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -2479,36 +2478,6 @@ static inline unsigned long folio_nr_pages(const struct folio *folio) return folio_large_nr_pages(folio); } -#if !defined(CONFIG_HAVE_GIGANTIC_FOLIOS) -/* - * We don't expect any folios that exceed buddy sizes (and consequently - * memory sections). - */ -#define MAX_FOLIO_ORDER MAX_PAGE_ORDER -#elif defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) -/* - * Only pages within a single memory section are guaranteed to be - * contiguous. By limiting folios to a single memory section, all folio - * pages are guaranteed to be contiguous. - */ -#define MAX_FOLIO_ORDER PFN_SECTION_SHIFT -#elif defined(CONFIG_HUGETLB_PAGE) -/* - * There is no real limit on the folio size. We limit them to the maximum we - * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect - * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit. 
- */ -#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) -#else -/* - * Without hugetlb, gigantic folios that are bigger than a single PUD are - * currently impossible. - */ -#define MAX_FOLIO_ORDER PUD_ORDER -#endif - -#define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) - /* * compound_nr() returns the number of pages in this potentially compound * page. compound_nr() can be called on a tail page, and is defined to diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index db41b18a919d..4c481ec77da9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -23,6 +23,7 @@ #include #include #include +#include #include /* Free memory management - zoned buddy allocator. */ @@ -61,6 +62,36 @@ */ #define PAGE_ALLOC_COSTLY_ORDER 3 +#if !defined(CONFIG_HAVE_GIGANTIC_FOLIOS) +/* + * We don't expect any folios that exceed buddy sizes (and consequently + * memory sections). + */ +#define MAX_FOLIO_ORDER MAX_PAGE_ORDER +#elif defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +/* + * Only pages within a single memory section are guaranteed to be + * contiguous. By limiting folios to a single memory section, all folio + * pages are guaranteed to be contiguous. + */ +#define MAX_FOLIO_ORDER PFN_SECTION_SHIFT +#elif defined(CONFIG_HUGETLB_PAGE) +/* + * There is no real limit on the folio size. We limit them to the maximum we + * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect + * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit. + */ +#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) +#else +/* + * Without hugetlb, gigantic folios that are bigger than a single PUD are + * currently impossible. 
+ */ +#define MAX_FOLIO_ORDER PUD_ORDER +#endif + +#define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) + enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, -- cgit v1.2.3 From f0369fb13619569ba8564ce8d4fc9d385bbee8a2 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:40 +0000 Subject: mm: change the interface of prep_compound_tail() Instead of passing down the head page and tail page index, pass the tail and head pages directly, as well as the order of the compound page. This is a preparation for changing how the head position is encoded in the tail page. Link: https://lkml.kernel.org/r/20260227194302.274384-3-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Reviewed-by: Zi Yan Acked-by: David Hildenbrand (arm) Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 5 +++-- mm/hugetlb.c | 8 +++++--- mm/internal.h | 11 +++++------ mm/mm_init.c | 2 +- mm/page_alloc.c | 2 +- 5 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 415e9f2ef616..7729a4a28b44 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -870,9 +870,10 @@ static inline bool folio_test_large(const struct folio *folio) return folio_test_head(folio); } -static __always_inline void set_compound_head(struct page *page, struct page *head) +static __always_inline void set_compound_head(struct page *tail, + const struct page *head, unsigned int order) { - WRITE_ONCE(page->compound_head, (unsigned long)head + 1); + 
WRITE_ONCE(tail->compound_head, (unsigned long)head + 1); } static __always_inline void clear_compound_head(struct page *page) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 327eaa4074d3..1d41fa3dd43e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3168,6 +3168,7 @@ found: /* Initialize [start_page:end_page_number] tail struct pages of a hugepage */ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, + struct hstate *h, unsigned long start_page_number, unsigned long end_page_number) { @@ -3176,6 +3177,7 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, struct page *page = folio_page(folio, start_page_number); unsigned long head_pfn = folio_pfn(folio); unsigned long pfn, end_pfn = head_pfn + end_page_number; + unsigned int order = huge_page_order(h); /* * As we marked all tail pages with memblock_reserved_mark_noinit(), @@ -3183,7 +3185,7 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, */ for (pfn = head_pfn + start_page_number; pfn < end_pfn; page++, pfn++) { __init_single_page(page, pfn, zone, nid); - prep_compound_tail((struct page *)folio, pfn - head_pfn); + prep_compound_tail(page, &folio->page, order); set_page_count(page, 0); } } @@ -3203,7 +3205,7 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio, __folio_set_head(folio); ret = folio_ref_freeze(folio, 1); VM_BUG_ON(!ret); - hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages); + hugetlb_folio_init_tail_vmemmap(folio, h, 1, nr_pages); prep_compound_head(&folio->page, huge_page_order(h)); } @@ -3260,7 +3262,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, * time as this is early in boot and there should * be no contention. 
*/ - hugetlb_folio_init_tail_vmemmap(folio, + hugetlb_folio_init_tail_vmemmap(folio, h, HUGETLB_VMEMMAP_RESERVE_PAGES, pages_per_huge_page(h)); } diff --git a/mm/internal.h b/mm/internal.h index 2daa6a744172..9cfbd8e41914 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -897,13 +897,12 @@ static inline void prep_compound_head(struct page *page, unsigned int order) INIT_LIST_HEAD(&folio->_deferred_list); } -static inline void prep_compound_tail(struct page *head, int tail_idx) +static inline void prep_compound_tail(struct page *tail, + const struct page *head, unsigned int order) { - struct page *p = head + tail_idx; - - p->mapping = TAIL_MAPPING; - set_compound_head(p, head); - set_page_private(p, 0); + tail->mapping = TAIL_MAPPING; + set_compound_head(tail, head, order); + set_page_private(tail, 0); } void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); diff --git a/mm/mm_init.c b/mm/mm_init.c index f903747ca854..5b261f86ba6f 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1110,7 +1110,7 @@ static void __ref memmap_init_compound(struct page *head, struct page *page = pfn_to_page(pfn); __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); - prep_compound_tail(head, pfn - head_pfn); + prep_compound_tail(page, head, order); set_page_count(page, 0); } prep_compound_head(head, order); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 74b603872f34..11f9a0525a3a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -744,7 +744,7 @@ void prep_compound_page(struct page *page, unsigned int order) __SetPageHead(page); for (i = 1; i < nr_pages; i++) - prep_compound_tail(page, i); + prep_compound_tail(page + i, page, order); prep_compound_head(page, order); } -- cgit v1.2.3 From d50569612c29215c5d1c64f47a65604ed265d2e9 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:41 +0000 Subject: mm: rename the 'compound_head' field in the 'struct page' to 'compound_info' The 'compound_head' field in the 'struct page' encodes 
whether the page is a tail and where to locate the head page. Bit 0 is set if the page is a tail, and the remaining bits in the field point to the head page. As preparation for changing how the field encodes information about the head page, rename the field to 'compound_info'. Link: https://lkml.kernel.org/r/20260227194302.274384-4-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Reviewed-by: Zi Yan Acked-by: David Hildenbrand (arm) Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: WANG Xuerui Signed-off-by: Andrew Morton --- Documentation/admin-guide/kdump/vmcoreinfo.rst | 2 +- Documentation/mm/vmemmap_dedup.rst | 6 +++--- include/linux/mm_types.h | 20 ++++++++++---------- include/linux/page-flags.h | 18 +++++++++--------- include/linux/types.h | 2 +- kernel/vmcore_info.c | 2 +- mm/page_alloc.c | 2 +- mm/slab.h | 2 +- mm/util.c | 2 +- 9 files changed, 28 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index 404a15f6782c..7663c610fe90 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -141,7 +141,7 @@ nodemask_t The size of a nodemask_t type. Used to compute the number of online nodes. 
-(page, flags|_refcount|mapping|lru|_mapcount|private|compound_order|compound_head) +(page, flags|_refcount|mapping|lru|_mapcount|private|compound_order|compound_info) ---------------------------------------------------------------------------------- User-space tools compute their values based on the offset of these diff --git a/Documentation/mm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst index b4a55b6569fa..1863d88d2dcb 100644 --- a/Documentation/mm/vmemmap_dedup.rst +++ b/Documentation/mm/vmemmap_dedup.rst @@ -24,7 +24,7 @@ For each base page, there is a corresponding ``struct page``. Within the HugeTLB subsystem, only the first 4 ``struct page`` are used to contain unique information about a HugeTLB page. ``__NR_USED_SUBPAGE`` provides this upper limit. The only 'useful' information in the remaining ``struct page`` -is the compound_head field, and this field is the same for all tail pages. +is the compound_info field, and this field is the same for all tail pages. By removing redundant ``struct page`` for HugeTLB pages, memory can be returned to the buddy allocator for other uses. @@ -124,10 +124,10 @@ Here is how things look before optimization:: | | +-----------+ -The value of page->compound_head is the same for all tail pages. The first +The value of page->compound_info is the same for all tail pages. The first page of ``struct page`` (page 0) associated with the HugeTLB page contains the 4 ``struct page`` necessary to describe the HugeTLB. The only use of the remaining -pages of ``struct page`` (page 1 to page 7) is to point to page->compound_head. +pages of ``struct page`` (page 1 to page 7) is to point to page->compound_info. Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of ``struct page`` will be used for each HugeTLB page. This will allow us to free the remaining 7 pages to the buddy allocator. 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3cc8ae722886..7bc82a2b889f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -126,14 +126,14 @@ struct page { atomic_long_t pp_ref_count; }; struct { /* Tail pages of compound page */ - unsigned long compound_head; /* Bit zero is set */ + unsigned long compound_info; /* Bit zero is set */ }; struct { /* ZONE_DEVICE pages */ /* - * The first word is used for compound_head or folio + * The first word is used for compound_info or folio * pgmap */ - void *_unused_pgmap_compound_head; + void *_unused_pgmap_compound_info; void *zone_device_data; /* * ZONE_DEVICE private pages are counted as being @@ -409,7 +409,7 @@ struct folio { /* private: avoid cluttering the output */ /* For the Unevictable "LRU list" slot */ struct { - /* Avoid compound_head */ + /* Avoid compound_info */ void *__filler; /* public: */ unsigned int mlock_count; @@ -510,7 +510,7 @@ struct folio { FOLIO_MATCH(flags, flags); FOLIO_MATCH(lru, lru); FOLIO_MATCH(mapping, mapping); -FOLIO_MATCH(compound_head, lru); +FOLIO_MATCH(compound_info, lru); FOLIO_MATCH(__folio_index, index); FOLIO_MATCH(private, private); FOLIO_MATCH(_mapcount, _mapcount); @@ -529,7 +529,7 @@ FOLIO_MATCH(_last_cpupid, _last_cpupid); static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); -FOLIO_MATCH(compound_head, _head_1); +FOLIO_MATCH(compound_info, _head_1); FOLIO_MATCH(_mapcount, _mapcount_1); FOLIO_MATCH(_refcount, _refcount_1); #undef FOLIO_MATCH @@ -537,13 +537,13 @@ FOLIO_MATCH(_refcount, _refcount_1); static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + 2 * sizeof(struct page)) FOLIO_MATCH(flags, _flags_2); -FOLIO_MATCH(compound_head, _head_2); +FOLIO_MATCH(compound_info, _head_2); #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + 3 * sizeof(struct page)) 
FOLIO_MATCH(flags, _flags_3); -FOLIO_MATCH(compound_head, _head_3); +FOLIO_MATCH(compound_info, _head_3); #undef FOLIO_MATCH /** @@ -609,8 +609,8 @@ struct ptdesc { #define TABLE_MATCH(pg, pt) \ static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt)) TABLE_MATCH(flags, pt_flags); -TABLE_MATCH(compound_head, pt_list); -TABLE_MATCH(compound_head, _pt_pad_1); +TABLE_MATCH(compound_info, pt_list); +TABLE_MATCH(compound_info, _pt_pad_1); TABLE_MATCH(mapping, __page_mapping); TABLE_MATCH(__folio_index, pt_index); TABLE_MATCH(rcu_head, pt_rcu_head); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 7729a4a28b44..265a798295ff 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -213,7 +213,7 @@ static __always_inline const struct page *page_fixed_fake_head(const struct page /* * Only addresses aligned with PAGE_SIZE of struct page may be fake head * struct page. The alignment check aims to avoid access the fields ( - * e.g. compound_head) of the @page[1]. It can avoid touch a (possibly) + * e.g. compound_info) of the @page[1]. It can avoid touch a (possibly) * cold cacheline in some cases. */ if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && @@ -223,7 +223,7 @@ static __always_inline const struct page *page_fixed_fake_head(const struct page * because the @page is a compound page composed with at least * two contiguous pages. 
*/ - unsigned long head = READ_ONCE(page[1].compound_head); + unsigned long head = READ_ONCE(page[1].compound_info); if (likely(head & 1)) return (const struct page *)(head - 1); @@ -281,7 +281,7 @@ static __always_inline int page_is_fake_head(const struct page *page) static __always_inline unsigned long _compound_head(const struct page *page) { - unsigned long head = READ_ONCE(page->compound_head); + unsigned long head = READ_ONCE(page->compound_info); if (unlikely(head & 1)) return head - 1; @@ -320,13 +320,13 @@ static __always_inline unsigned long _compound_head(const struct page *page) static __always_inline int PageTail(const struct page *page) { - return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page); + return READ_ONCE(page->compound_info) & 1 || page_is_fake_head(page); } static __always_inline int PageCompound(const struct page *page) { return test_bit(PG_head, &page->flags.f) || - READ_ONCE(page->compound_head) & 1; + READ_ONCE(page->compound_info) & 1; } #define PAGE_POISON_PATTERN -1l @@ -348,7 +348,7 @@ static const unsigned long *const_folio_flags(const struct folio *folio, { const struct page *page = &folio->page; - VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); + VM_BUG_ON_PGFLAGS(page->compound_info & 1, page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); return &page[n].flags.f; } @@ -357,7 +357,7 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) { struct page *page = &folio->page; - VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); + VM_BUG_ON_PGFLAGS(page->compound_info & 1, page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); return &page[n].flags.f; } @@ -873,12 +873,12 @@ static inline bool folio_test_large(const struct folio *folio) static __always_inline void set_compound_head(struct page *tail, const struct page *head, unsigned int order) { - WRITE_ONCE(tail->compound_head, (unsigned long)head + 1); + WRITE_ONCE(tail->compound_info, (unsigned long)head 
+ 1); } static __always_inline void clear_compound_head(struct page *page) { - WRITE_ONCE(page->compound_head, 0); + WRITE_ONCE(page->compound_info, 0); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/include/linux/types.h b/include/linux/types.h index 7e71d260763c..608050dbca6a 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -239,7 +239,7 @@ struct ustat { * * This guarantee is important for few reasons: * - future call_rcu_lazy() will make use of lower bits in the pointer; - * - the structure shares storage space in struct page with @compound_head, + * - the structure shares storage space in struct page with @compound_info, * which encode PageTail() in bit 0. The guarantee is needed to avoid * false-positive PageTail(). */ diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index 8d82913223a1..94e4ef75b1b2 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -198,7 +198,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(page, compound_head); + VMCOREINFO_OFFSET(page, compound_info); VMCOREINFO_OFFSET(pglist_data, node_zones); VMCOREINFO_OFFSET(pglist_data, nr_zones); #ifdef CONFIG_FLATMEM diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 11f9a0525a3a..f4f9a98bb425 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -731,7 +731,7 @@ static inline bool pcp_allowed_order(unsigned int order) * The first PAGE_SIZE page is called the "head page" and have PG_head set. * * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded - * in bit 0 of page->compound_head. The rest of bits is pointer to head page. + * in bit 0 of page->compound_info. The rest of bits is pointer to head page. * * The first tail page's ->compound_order holds the order of allocation. * This usage means that zero-order pages may not be compound. 
diff --git a/mm/slab.h b/mm/slab.h index e9ab292acd22..0653cf5fd93a 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -94,7 +94,7 @@ struct slab { #define SLAB_MATCH(pg, sl) \ static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl)) SLAB_MATCH(flags, flags); -SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */ +SLAB_MATCH(compound_info, slab_cache); /* Ensure bit 0 is clear */ SLAB_MATCH(_refcount, __page_refcount); #ifdef CONFIG_MEMCG SLAB_MATCH(memcg_data, obj_exts); diff --git a/mm/util.c b/mm/util.c index 419cb81ab353..52400a3c5eb4 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1276,7 +1276,7 @@ void snapshot_page(struct page_snapshot *ps, const struct page *page) again: memset(&ps->folio_snapshot, 0, sizeof(struct folio)); memcpy(&ps->page_snapshot, page, sizeof(*page)); - head = ps->page_snapshot.compound_head; + head = ps->page_snapshot.compound_info; if ((head & 1) == 0) { ps->idx = 0; foliop = (struct folio *)&ps->page_snapshot; -- cgit v1.2.3 From 67c79a5af051f57339ecf383d3f67e200741ce20 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:42 +0000 Subject: mm: move set/clear_compound_head() next to compound_head() Move set_compound_head() and clear_compound_head() to be adjacent to the compound_head() function in page-flags.h. These functions encode and decode the same compound_info field, so keeping them together makes it easier to verify their logic is consistent, especially when the encoding changes. 
Link: https://lkml.kernel.org/r/20260227194302.274384-5-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Reviewed-by: Zi Yan Acked-by: David Hildenbrand (arm) Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 265a798295ff..5c469d38dd69 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -290,6 +290,17 @@ static __always_inline unsigned long _compound_head(const struct page *page) #define compound_head(page) ((typeof(page))_compound_head(page)) +static __always_inline void set_compound_head(struct page *tail, + const struct page *head, unsigned int order) +{ + WRITE_ONCE(tail->compound_info, (unsigned long)head + 1); +} + +static __always_inline void clear_compound_head(struct page *page) +{ + WRITE_ONCE(page->compound_info, 0); +} + /** * page_folio - Converts from page to folio. * @p: The page. 
@@ -870,17 +881,6 @@ static inline bool folio_test_large(const struct folio *folio) return folio_test_head(folio); } -static __always_inline void set_compound_head(struct page *tail, - const struct page *head, unsigned int order) -{ - WRITE_ONCE(tail->compound_info, (unsigned long)head + 1); -} - -static __always_inline void clear_compound_head(struct page *page) -{ - WRITE_ONCE(page->compound_info, 0); -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void ClearPageCompound(struct page *page) { -- cgit v1.2.3 From 476849b0fba4450f5adf22196bcff9c24c673bc4 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:43 +0000 Subject: riscv/mm: align vmemmap to maximal folio size The upcoming change to the HugeTLB vmemmap optimization (HVO) requires struct pages of the head page to be naturally aligned with regard to the folio size. Align vmemmap to the newly introduced MAX_FOLIO_VMEMMAP_ALIGN. Link: https://lkml.kernel.org/r/20260227194302.274384-6-kas@kernel.org Signed-off-by: Kiryl Shutsemau Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Hildenbrand (arm) Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: Vlastimil Babka Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/riscv/mm/init.c | 3 ++- include/linux/mmzone.h | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 017bad735d47..b5c50956bb8a 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -63,7 +63,8 @@ phys_addr_t phys_ram_base __ro_after_init; EXPORT_SYMBOL(phys_ram_base); #ifdef CONFIG_SPARSEMEM_VMEMMAP -#define VMEMMAP_ADDR_ALIGN (1ULL << SECTION_SIZE_BITS) +#define 
VMEMMAP_ADDR_ALIGN max(1ULL << SECTION_SIZE_BITS, \ + MAX_FOLIO_VMEMMAP_ALIGN) unsigned long vmemmap_start_pfn __ro_after_init; EXPORT_SYMBOL(vmemmap_start_pfn); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4c481ec77da9..0bef68e41f19 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -92,6 +92,17 @@ #define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) +/* + * HugeTLB Vmemmap Optimization (HVO) requires struct pages of the head page to + * be naturally aligned with regard to the folio size. + * + * HVO which is only active if the size of struct page is a power of 2. + */ +#define MAX_FOLIO_VMEMMAP_ALIGN \ + (IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) && \ + is_power_of_2(sizeof(struct page)) ? \ + MAX_FOLIO_NR_PAGES * sizeof(struct page) : 0) + enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, -- cgit v1.2.3 From 8c846c879e226c312c2c7a7bc1e323779903530f Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:45 +0000 Subject: mm: rework compound_head() for power-of-2 sizeof(struct page) For tail pages, the kernel uses the 'compound_info' field to get to the head page. The bit 0 of the field indicates whether the page is a tail page, and if set, the remaining bits represent a pointer to the head page. For cases when size of struct page is power-of-2, change the encoding of compound_info to store a mask that can be applied to the virtual address of the tail page in order to access the head page. It is possible because struct page of the head page is naturally aligned with regards to order of the page. The significant impact of this modification is that all tail pages of the same order will now have identical 'compound_info', regardless of the compound page they are associated with. This paves the way for eliminating fake heads. The HugeTLB Vmemmap Optimization (HVO) creates fake heads and it is only applied when the sizeof(struct page) is power-of-2. 
Having identical tail pages allows the same page to be mapped into the vmemmap of all pages, maintaining memory savings without fake heads. If sizeof(struct page) is not a power of 2, there are no functional changes. Limit mask usage to HugeTLB vmemmap optimization (HVO) where it makes a difference. The approach with mask would work in a wider set of conditions, but it requires validating that struct pages are naturally aligned for all orders up to the MAX_FOLIO_ORDER, which can be tricky. Link: https://lkml.kernel.org/r/20260227194302.274384-8-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Reviewed-by: Zi Yan Acked-by: David Hildenbrand (Arm) Acked-by: Usama Arif Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 81 +++++++++++++++++++++++++++++++++++++++++----- mm/slab.h | 16 ++++++--- mm/util.c | 16 ++++++--- 3 files changed, 97 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5c469d38dd69..43876b108f0a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -198,6 +198,29 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H +/* + * For tail pages, if the size of struct page is power-of-2 ->compound_info + * encodes the mask that converts the address of the tail page address to + * the head page address. + * + * Otherwise, ->compound_info has direct pointer to head pages. + */ +static __always_inline bool compound_info_has_mask(void) +{ + /* + * Limit mask usage to HugeTLB vmemmap optimization (HVO) where it + * makes a difference.
+ * + * The approach with mask would work in the wider set of conditions, + * but it requires validating that struct pages are naturally aligned + * for all orders up to the MAX_FOLIO_ORDER, which can be tricky. + */ + if (!IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP)) + return false; + + return is_power_of_2(sizeof(struct page)); +} + #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); @@ -207,6 +230,10 @@ DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { + /* Fake heads only exists if compound_info_has_mask() is true */ + if (!compound_info_has_mask()) + return page; + if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) return page; @@ -223,10 +250,14 @@ static __always_inline const struct page *page_fixed_fake_head(const struct page * because the @page is a compound page composed with at least * two contiguous pages. */ - unsigned long head = READ_ONCE(page[1].compound_info); + unsigned long info = READ_ONCE(page[1].compound_info); + + /* See set_compound_head() */ + if (likely(info & 1)) { + unsigned long p = (unsigned long)page; - if (likely(head & 1)) - return (const struct page *)(head - 1); + return (const struct page *)(p & info); + } } return page; } @@ -281,11 +312,26 @@ static __always_inline int page_is_fake_head(const struct page *page) static __always_inline unsigned long _compound_head(const struct page *page) { - unsigned long head = READ_ONCE(page->compound_info); + unsigned long info = READ_ONCE(page->compound_info); - if (unlikely(head & 1)) - return head - 1; - return (unsigned long)page_fixed_fake_head(page); + /* Bit 0 encodes PageTail() */ + if (!(info & 1)) + return (unsigned long)page_fixed_fake_head(page); + + /* + * If compound_info_has_mask() is false, the rest of compound_info is + * the pointer to the head page. 
+ */ + if (!compound_info_has_mask()) + return info - 1; + + /* + * If compound_info_has_mask() is true the rest of the info encodes + * the mask that converts the address of the tail page to the head page. + * + * No need to clear bit 0 in the mask as 'page' always has it clear. + */ + return (unsigned long)page & info; } #define compound_head(page) ((typeof(page))_compound_head(page)) @@ -293,7 +339,26 @@ static __always_inline unsigned long _compound_head(const struct page *page) static __always_inline void set_compound_head(struct page *tail, const struct page *head, unsigned int order) { - WRITE_ONCE(tail->compound_info, (unsigned long)head + 1); + unsigned int shift; + unsigned long mask; + + if (!compound_info_has_mask()) { + WRITE_ONCE(tail->compound_info, (unsigned long)head | 1); + return; + } + + /* + * If the size of struct page is power-of-2, bits [shift:0] of the + * virtual address of compound head are zero. + * + * Calculate mask that can be applied to the virtual address of + * the tail page to get address of the head page. 
+ */ + shift = order + order_base_2(sizeof(struct page)); + mask = GENMASK(BITS_PER_LONG - 1, shift); + + /* Bit 0 encodes PageTail() */ + WRITE_ONCE(tail->compound_info, mask | 1); } static __always_inline void clear_compound_head(struct page *page) diff --git a/mm/slab.h b/mm/slab.h index 0653cf5fd93a..ccbdbed18c05 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -131,11 +131,19 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(struct freelist */ static inline struct slab *page_slab(const struct page *page) { - unsigned long head; + unsigned long info; + + info = READ_ONCE(page->compound_info); + if (info & 1) { + /* See compound_head() */ + if (compound_info_has_mask()) { + unsigned long p = (unsigned long)page; + page = (struct page *)(p & info); + } else { + page = (struct page *)(info - 1); + } + } - head = READ_ONCE(page->compound_head); - if (head & 1) - page = (struct page *)(head - 1); if (data_race(page->page_type >> 24) != PGTY_slab) page = NULL; diff --git a/mm/util.c b/mm/util.c index 52400a3c5eb4..ce7ae80047cf 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1266,7 +1266,7 @@ static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio, */ void snapshot_page(struct page_snapshot *ps, const struct page *page) { - unsigned long head, nr_pages = 1; + unsigned long info, nr_pages = 1; struct folio *foliop; int loops = 5; @@ -1276,8 +1276,8 @@ void snapshot_page(struct page_snapshot *ps, const struct page *page) again: memset(&ps->folio_snapshot, 0, sizeof(struct folio)); memcpy(&ps->page_snapshot, page, sizeof(*page)); - head = ps->page_snapshot.compound_info; - if ((head & 1) == 0) { + info = ps->page_snapshot.compound_info; + if (!(info & 1)) { ps->idx = 0; foliop = (struct folio *)&ps->page_snapshot; if (!folio_test_large(foliop)) { @@ -1288,7 +1288,15 @@ again: } foliop = (struct folio *)page; } else { - foliop = (struct folio *)(head - 1); + /* See compound_head() */ + if (compound_info_has_mask()) { + unsigned long p = 
(unsigned long)page; + + foliop = (struct folio *)(p & info); + } else { + foliop = (struct folio *)(info - 1); + } + ps->idx = folio_page_idx(foliop, page); } -- cgit v1.2.3 From 209e6d9eb13aaf1b6e0fc6f76afc00d055e5ba12 Mon Sep 17 00:00:00 2001 From: "Kiryl Shutsemau (Meta)" Date: Fri, 27 Feb 2026 19:42:47 +0000 Subject: mm/hugetlb: defer vmemmap population for bootmem hugepages Currently, the vmemmap for bootmem-allocated gigantic pages is populated early in hugetlb_vmemmap_init_early(). However, the zone information is only available after zones are initialized. If it is later discovered that a page spans multiple zones, the HVO mapping must be undone and replaced with a normal mapping using vmemmap_undo_hvo(). Defer the actual vmemmap population to hugetlb_vmemmap_init_late(). At this stage, zones are already initialized, so it can be checked if the page is valid for HVO before deciding how to populate the vmemmap. This allows us to remove vmemmap_undo_hvo() and the complex logic required to rollback HVO mappings. In hugetlb_vmemmap_init_late(), if HVO population fails or if the zones are invalid, fall back to a normal vmemmap population. Postponing population until hugetlb_vmemmap_init_late() also makes zone information available from within vmemmap_populate_hvo(). 
Link: https://lkml.kernel.org/r/20260227194302.274384-10-kas@kernel.org Signed-off-by: Kiryl Shutsemau (Meta) Acked-by: David Hildenbrand (Arm) Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: Vlastimil Babka Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- mm/hugetlb_vmemmap.c | 37 ++++++++++++++++++------------------ mm/sparse-vmemmap.c | 53 ---------------------------------------------------- 3 files changed, 18 insertions(+), 74 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4e999c21d89a..d7e53532a109 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4481,8 +4481,6 @@ int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node, unsigned long headsize); -int vmemmap_undo_hvo(unsigned long start, unsigned long end, int node, - unsigned long headsize); void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node, unsigned long headsize); void vmemmap_populate_print_last(void); diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index a9280259e12a..935ec5829be9 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -790,7 +790,6 @@ void __init hugetlb_vmemmap_init_early(int nid) { unsigned long psize, paddr, section_size; unsigned long ns, i, pnum, pfn, nr_pages; - unsigned long start, end; struct huge_bootmem_page *m = NULL; void *map; @@ -808,14 +807,6 @@ void __init hugetlb_vmemmap_init_early(int nid) paddr = virt_to_phys(m); pfn = PHYS_PFN(paddr); map = pfn_to_page(pfn); - start = (unsigned long)map; - 
end = start + nr_pages * sizeof(struct page); - - if (vmemmap_populate_hvo(start, end, nid, - HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) - continue; - - memmap_boot_pages_add(HUGETLB_VMEMMAP_RESERVE_SIZE / PAGE_SIZE); pnum = pfn_to_section_nr(pfn); ns = psize / section_size; @@ -850,28 +841,36 @@ void __init hugetlb_vmemmap_init_late(int nid) h = m->hstate; pfn = PHYS_PFN(phys); nr_pages = pages_per_huge_page(h); + map = pfn_to_page(pfn); + start = (unsigned long)map; + end = start + nr_pages * sizeof(struct page); if (!hugetlb_bootmem_page_zones_valid(nid, m)) { /* * Oops, the hugetlb page spans multiple zones. - * Remove it from the list, and undo HVO. + * Remove it from the list, and populate it normally. */ list_del(&m->list); - map = pfn_to_page(pfn); - - start = (unsigned long)map; - end = start + nr_pages * sizeof(struct page); - - vmemmap_undo_hvo(start, end, nid, - HUGETLB_VMEMMAP_RESERVE_SIZE); - nr_mmap = end - start - HUGETLB_VMEMMAP_RESERVE_SIZE; + vmemmap_populate(start, end, nid, NULL); + nr_mmap = end - start; memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE)); memblock_phys_free(phys, huge_page_size(h)); continue; - } else + } + + if (vmemmap_populate_hvo(start, end, nid, + HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) { + /* Fallback if HVO population fails */ + vmemmap_populate(start, end, nid, NULL); + nr_mmap = end - start; + } else { m->flags |= HUGE_BOOTMEM_ZONES_VALID; + nr_mmap = HUGETLB_VMEMMAP_RESERVE_SIZE; + } + + memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE)); } } #endif diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 37522d6cb398..032a81450838 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -302,59 +302,6 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end, return vmemmap_populate_range(start, end, node, altmap, -1, 0); } -/* - * Undo populate_hvo, and replace it with a normal base page mapping. - * Used in memory init in case a HVO mapping needs to be undone. 
- * - * This can happen when it is discovered that a memblock allocated - * hugetlb page spans multiple zones, which can only be verified - * after zones have been initialized. - * - * We know that: - * 1) The first @headsize / PAGE_SIZE vmemmap pages were individually - * allocated through memblock, and mapped. - * - * 2) The rest of the vmemmap pages are mirrors of the last head page. - */ -int __meminit vmemmap_undo_hvo(unsigned long addr, unsigned long end, - int node, unsigned long headsize) -{ - unsigned long maddr, pfn; - pte_t *pte; - int headpages; - - /* - * Should only be called early in boot, so nothing will - * be accessing these page structures. - */ - WARN_ON(!early_boot_irqs_disabled); - - headpages = headsize >> PAGE_SHIFT; - - /* - * Clear mirrored mappings for tail page structs. - */ - for (maddr = addr + headsize; maddr < end; maddr += PAGE_SIZE) { - pte = virt_to_kpte(maddr); - pte_clear(&init_mm, maddr, pte); - } - - /* - * Clear and free mappings for head page and first tail page - * structs. - */ - for (maddr = addr; headpages-- > 0; maddr += PAGE_SIZE) { - pte = virt_to_kpte(maddr); - pfn = pte_pfn(ptep_get(pte)); - pte_clear(&init_mm, maddr, pte); - memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE); - } - - flush_tlb_kernel_range(addr, end); - - return vmemmap_populate(addr, end, node, NULL); -} - /* * Write protect the mirrored tail page structs for HVO. This will be * called from the hugetlb code when gathering and initializing the -- cgit v1.2.3 From 622026e87c4019e609010811757e31193cc23847 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:50 +0000 Subject: mm/hugetlb: remove fake head pages HugeTLB Vmemmap Optimization (HVO) reduces memory usage by freeing most vmemmap pages for huge pages and remapping the freed range to a single page containing the struct page metadata. 
With the new mask-based compound_info encoding (for power-of-2 struct page sizes), all tail pages of the same order are now identical regardless of which compound page they belong to. This means the tail pages can be truly shared without fake heads. Allocate a single page of initialized tail struct pages per zone per order in the vmemmap_tails[] array in struct zone. All huge pages of that order in the zone share this tail page, mapped read-only into their vmemmap. The head page remains unique per huge page. Redefine MAX_FOLIO_ORDER using ilog2(). The define has to produce a compile-constant as it is used to specify vmemmap_tail array size. For some reason, compiler is not able to solve get_order() at compile-time, but ilog2() works. Avoid PUD_ORDER to define MAX_FOLIO_ORDER as it adds dependency to <linux/pgtable.h> which generates hard-to-break include loop. This eliminates fake heads while maintaining the same memory savings, and simplifies compound_head() by removing fake head detection. Link: https://lkml.kernel.org/r/20260227194302.274384-13-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Vlastimil Babka (SUSE) Acked-by: David Hildenbrand (Arm) Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 ++- include/linux/mmzone.h | 19 +++++++++++-- mm/hugetlb_vmemmap.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++--- mm/internal.h | 9 +++++++ mm/sparse-vmemmap.c | 55 +++++++++++++++++++++++++++++++------ 5 files changed, 145 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index d7e53532a109..19619e5efeba 100644 
--- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4479,7 +4479,8 @@ int vmemmap_populate_hugepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); -int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node, +int vmemmap_populate_hvo(unsigned long start, unsigned long end, + unsigned int order, struct zone *zone, unsigned long headsize); void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node, unsigned long headsize); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0bef68e41f19..5c3ae0348754 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -81,13 +81,17 @@ * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit. */ -#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) +#ifdef CONFIG_64BIT +#define MAX_FOLIO_ORDER (ilog2(SZ_16G) - PAGE_SHIFT) +#else +#define MAX_FOLIO_ORDER (ilog2(SZ_1G) - PAGE_SHIFT) +#endif #else /* * Without hugetlb, gigantic folios that are bigger than a single PUD are * currently impossible. */ -#define MAX_FOLIO_ORDER PUD_ORDER +#define MAX_FOLIO_ORDER (PUD_SHIFT - PAGE_SHIFT) #endif #define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) @@ -103,6 +107,14 @@ is_power_of_2(sizeof(struct page)) ? \ MAX_FOLIO_NR_PAGES * sizeof(struct page) : 0) +/* + * vmemmap optimization (like HVO) is only possible for page orders that fill + * two or more pages with struct pages. + */ +#define VMEMMAP_TAIL_MIN_ORDER (ilog2(2 * PAGE_SIZE / sizeof(struct page))) +#define __NR_VMEMMAP_TAILS (MAX_FOLIO_ORDER - VMEMMAP_TAIL_MIN_ORDER + 1) +#define NR_VMEMMAP_TAILS (__NR_VMEMMAP_TAILS > 0 ? 
__NR_VMEMMAP_TAILS : 0) + enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, @@ -1113,6 +1125,9 @@ struct zone { /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP + struct page *vmemmap_tails[NR_VMEMMAP_TAILS]; +#endif } ____cacheline_internodealigned_in_smp; enum pgdat_flags { diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 3628fb5b2a28..92330f172eb7 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -19,6 +19,7 @@ #include #include "hugetlb_vmemmap.h" +#include "internal.h" /** * struct vmemmap_remap_walk - walk vmemmap page table @@ -505,6 +506,32 @@ static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio * return true; } +static struct page *vmemmap_get_tail(unsigned int order, struct zone *zone) +{ + const unsigned int idx = order - VMEMMAP_TAIL_MIN_ORDER; + struct page *tail, *p; + int node = zone_to_nid(zone); + + tail = READ_ONCE(zone->vmemmap_tails[idx]); + if (likely(tail)) + return tail; + + tail = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!tail) + return NULL; + + p = page_to_virt(tail); + for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++) + init_compound_tail(p + i, NULL, order, zone); + + if (cmpxchg(&zone->vmemmap_tails[idx], NULL, tail)) { + __free_page(tail); + tail = READ_ONCE(zone->vmemmap_tails[idx]); + } + + return tail; +} + static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio, struct list_head *vmemmap_pages, @@ -520,6 +547,11 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, if (!vmemmap_should_optimize_folio(h, folio)) return ret; + nid = folio_nid(folio); + vmemmap_tail = vmemmap_get_tail(h->order, folio_zone(folio)); + if (!vmemmap_tail) + return -ENOMEM; + static_branch_inc(&hugetlb_optimize_vmemmap_key); if (flags & VMEMMAP_SYNCHRONIZE_RCU) @@ -537,7 +569,6 @@ static int 
__hugetlb_vmemmap_optimize_folio(const struct hstate *h, */ folio_set_hugetlb_vmemmap_optimized(folio); - nid = folio_nid(folio); vmemmap_head = alloc_pages_node(nid, GFP_KERNEL, 0); if (!vmemmap_head) { ret = -ENOMEM; @@ -548,7 +579,6 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, list_add(&vmemmap_head->lru, vmemmap_pages); memmap_pages_add(1); - vmemmap_tail = vmemmap_head; vmemmap_start = (unsigned long)&folio->page; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); @@ -776,11 +806,26 @@ void __init hugetlb_vmemmap_init_early(int nid) } } +static struct zone *pfn_to_zone(unsigned nid, unsigned long pfn) +{ + struct zone *zone; + enum zone_type zone_type; + + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { + zone = &NODE_DATA(nid)->node_zones[zone_type]; + if (zone_spans_pfn(zone, pfn)) + return zone; + } + + return NULL; +} + void __init hugetlb_vmemmap_init_late(int nid) { struct huge_bootmem_page *m, *tm; unsigned long phys, nr_pages, start, end; unsigned long pfn, nr_mmap; + struct zone *zone = NULL; struct hstate *h; void *map; @@ -814,7 +859,12 @@ void __init hugetlb_vmemmap_init_late(int nid) continue; } - if (vmemmap_populate_hvo(start, end, nid, + if (!zone || !zone_spans_pfn(zone, pfn)) + zone = pfn_to_zone(nid, pfn); + if (WARN_ON_ONCE(!zone)) + continue; + + if (vmemmap_populate_hvo(start, end, huge_page_order(h), zone, HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) { /* Fallback if HVO population fails */ vmemmap_populate(start, end, nid, NULL); @@ -842,10 +892,27 @@ static const struct ctl_table hugetlb_vmemmap_sysctls[] = { static int __init hugetlb_vmemmap_init(void) { const struct hstate *h; + struct zone *zone; /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */ BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES); + for_each_zone(zone) { + for (int i = 0; i < NR_VMEMMAP_TAILS; i++) { + struct page *tail, *p; + unsigned int order; + + tail = zone->vmemmap_tails[i]; + if (!tail) + 
continue; + + order = i + VMEMMAP_TAIL_MIN_ORDER; + p = page_to_virt(tail); + for (int j = 0; j < PAGE_SIZE / sizeof(struct page); j++) + init_compound_tail(p + j, NULL, order, zone); + } + } + for_each_hstate(h) { if (hugetlb_vmemmap_optimizable(h)) { register_sysctl_init("vm", hugetlb_vmemmap_sysctls); diff --git a/mm/internal.h b/mm/internal.h index 9cfbd8e41914..84167b0570c9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -905,6 +905,15 @@ static inline void prep_compound_tail(struct page *tail, set_page_private(tail, 0); } +static inline void init_compound_tail(struct page *tail, + const struct page *head, unsigned int order, struct zone *zone) +{ + atomic_set(&tail->_mapcount, -1); + set_page_node(tail, zone_to_nid(zone)); + set_page_zone(tail, zone_idx(zone)); + prep_compound_tail(tail, head, order); +} + void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern bool free_pages_prepare(struct page *page, unsigned int order); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 032a81450838..842ed2f0bce6 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -325,16 +325,54 @@ void vmemmap_wrprotect_hvo(unsigned long addr, unsigned long end, } } -/* - * Populate vmemmap pages HVO-style. The first page contains the head - * page and needed tail pages, the other ones are mirrors of the first - * page. - */ +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +static __meminit struct page *vmemmap_get_tail(unsigned int order, struct zone *zone) +{ + struct page *p, *tail; + unsigned int idx; + int node = zone_to_nid(zone); + + if (WARN_ON_ONCE(order < VMEMMAP_TAIL_MIN_ORDER)) + return NULL; + if (WARN_ON_ONCE(order > MAX_FOLIO_ORDER)) + return NULL; + + idx = order - VMEMMAP_TAIL_MIN_ORDER; + tail = zone->vmemmap_tails[idx]; + if (tail) + return tail; + + /* + * Only allocate the page, but do not initialize it. + * + * Any initialization done here will be overwritten by memmap_init(). 
+ * + * hugetlb_vmemmap_init() will take care of initialization after + * memmap_init(). + */ + + p = vmemmap_alloc_block_zero(PAGE_SIZE, node); + if (!p) + return NULL; + + tail = virt_to_page(p); + zone->vmemmap_tails[idx] = tail; + + return tail; +} + int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end, - int node, unsigned long headsize) + unsigned int order, struct zone *zone, + unsigned long headsize) { - pte_t *pte; unsigned long maddr; + struct page *tail; + pte_t *pte; + int node = zone_to_nid(zone); + + tail = vmemmap_get_tail(order, zone); + if (!tail) + return -ENOMEM; for (maddr = addr; maddr < addr + headsize; maddr += PAGE_SIZE) { pte = vmemmap_populate_address(maddr, node, NULL, -1, 0); @@ -346,8 +384,9 @@ int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end, * Reuse the last page struct page mapped above for the rest. */ return vmemmap_populate_range(maddr, end, node, NULL, - pte_pfn(ptep_get(pte)), 0); + page_to_pfn(tail), 0); } +#endif void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, unsigned long addr, unsigned long next) -- cgit v1.2.3 From 32c440d67e6cd96a715007d0e62eb970b0c49abc Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:51 +0000 Subject: mm: drop fake head checks With fake head pages eliminated in the previous commit, remove the supporting infrastructure: - page_fixed_fake_head(): no longer needed to detect fake heads; - page_is_fake_head(): no longer needed; - page_count_writable(): no longer needed for RCU protection; - RCU read_lock in page_ref_add_unless(): no longer needed; This substantially simplifies compound_head() and page_ref_add_unless(), removing both branches and RCU overhead from these hot paths. RCU was required to serialize allocation of hugetlb page against get_page_unless_zero() and prevent writing to read-only fake head. It is redundant without fake heads. 
See bd225530a4c7 ("mm/hugetlb_vmemmap: fix race with speculative PFN walkers") for more details. synchronize_rcu() in mm/hugetlb_vmemmap.c will be removed by a separate patch. Link: https://lkml.kernel.org/r/20260227194302.274384-14-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Acked-by: David Hildenbrand (Arm) Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 93 ++-------------------------------------------- include/linux/page_ref.h | 8 +--- 2 files changed, 4 insertions(+), 97 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 43876b108f0a..b8eef2181598 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -221,102 +221,15 @@ static __always_inline bool compound_info_has_mask(void) return is_power_of_2(sizeof(struct page)); } -#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); -/* - * Return the real head page struct iff the @page is a fake head page, otherwise - * return the @page itself. See Documentation/mm/vmemmap_dedup.rst. - */ -static __always_inline const struct page *page_fixed_fake_head(const struct page *page) -{ - /* Fake heads only exists if compound_info_has_mask() is true */ - if (!compound_info_has_mask()) - return page; - - if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) - return page; - - /* - * Only addresses aligned with PAGE_SIZE of struct page may be fake head - * struct page. The alignment check aims to avoid access the fields ( - * e.g. 
compound_info) of the @page[1]. It can avoid touch a (possibly) - * cold cacheline in some cases. - */ - if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && - test_bit(PG_head, &page->flags.f)) { - /* - * We can safely access the field of the @page[1] with PG_head - * because the @page is a compound page composed with at least - * two contiguous pages. - */ - unsigned long info = READ_ONCE(page[1].compound_info); - - /* See set_compound_head() */ - if (likely(info & 1)) { - unsigned long p = (unsigned long)page; - - return (const struct page *)(p & info); - } - } - return page; -} - -static __always_inline bool page_count_writable(const struct page *page, int u) -{ - if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) - return true; - - /* - * The refcount check is ordered before the fake-head check to prevent - * the following race: - * CPU 1 (HVO) CPU 2 (speculative PFN walker) - * - * page_ref_freeze() - * synchronize_rcu() - * rcu_read_lock() - * page_is_fake_head() is false - * vmemmap_remap_pte() - * XXX: struct page[] becomes r/o - * - * page_ref_unfreeze() - * page_ref_count() is not zero - * - * atomic_add_unless(&page->_refcount) - * XXX: try to modify r/o struct page[] - * - * The refcount check also prevents modification attempts to other (r/o) - * tail pages that are not fake heads. 
- */ - if (atomic_read_acquire(&page->_refcount) == u) - return false; - - return page_fixed_fake_head(page) == page; -} -#else -static inline const struct page *page_fixed_fake_head(const struct page *page) -{ - return page; -} - -static inline bool page_count_writable(const struct page *page, int u) -{ - return true; -} -#endif - -static __always_inline int page_is_fake_head(const struct page *page) -{ - return page_fixed_fake_head(page) != page; -} - static __always_inline unsigned long _compound_head(const struct page *page) { unsigned long info = READ_ONCE(page->compound_info); /* Bit 0 encodes PageTail() */ if (!(info & 1)) - return (unsigned long)page_fixed_fake_head(page); + return (unsigned long)page; /* * If compound_info_has_mask() is false, the rest of compound_info is @@ -396,7 +309,7 @@ static __always_inline void clear_compound_head(struct page *page) static __always_inline int PageTail(const struct page *page) { - return READ_ONCE(page->compound_info) & 1 || page_is_fake_head(page); + return READ_ONCE(page->compound_info) & 1; } static __always_inline int PageCompound(const struct page *page) @@ -928,7 +841,7 @@ static __always_inline bool folio_test_head(const struct folio *folio) static __always_inline int PageHead(const struct page *page) { PF_POISONED_CHECK(page); - return test_bit(PG_head, &page->flags.f) && !page_is_fake_head(page); + return test_bit(PG_head, &page->flags.f); } __SETPAGEFLAG(Head, head, PF_ANY) diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 544150d1d5fd..490d0ad6e56d 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -230,13 +230,7 @@ static inline int folio_ref_dec_return(struct folio *folio) static inline bool page_ref_add_unless(struct page *page, int nr, int u) { - bool ret = false; - - rcu_read_lock(); - /* avoid writing to the vmemmap area being remapped */ - if (page_count_writable(page, u)) - ret = atomic_add_unless(&page->_refcount, nr, u); - rcu_read_unlock(); + bool ret = 
atomic_add_unless(&page->_refcount, nr, u); if (page_ref_tracepoint_active(page_ref_mod_unless)) __page_ref_mod_unless(page, nr, ret); -- cgit v1.2.3 From da3e2d1ca43de56a83a806237b6be7e91cf07052 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:53 +0000 Subject: mm/hugetlb: remove hugetlb_optimize_vmemmap_key static key The hugetlb_optimize_vmemmap_key static key was used to guard fake head detection in compound_head() and related functions. It allowed skipping the fake head checks entirely when HVO was not in use. With fake heads eliminated and the detection code removed, the static key serves no purpose. Remove its definition and all increment/decrement calls. Link: https://lkml.kernel.org/r/20260227194302.274384-16-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Acked-by: David Hildenbrand (Arm) Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: Vlastimil Babka Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 2 -- mm/hugetlb_vmemmap.c | 14 ++------------ 2 files changed, 2 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b8eef2181598..f361bd6c814c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -221,8 +221,6 @@ static __always_inline bool compound_info_has_mask(void) return is_power_of_2(sizeof(struct page)); } -DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); - static __always_inline unsigned long _compound_head(const struct page *page) { unsigned long info = READ_ONCE(page->compound_info); diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 
fd1d5d5d12b4..4a077d231d3a 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -385,9 +385,6 @@ static int vmemmap_remap_alloc(unsigned long start, unsigned long end, return vmemmap_remap_range(start, end, &walk); } -DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); -EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); - static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); static int __init hugetlb_vmemmap_optimize_param(char *buf) { @@ -419,10 +416,8 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, * discarded vmemmap pages must be allocated and remapping. */ ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, flags); - if (!ret) { + if (!ret) folio_clear_hugetlb_vmemmap_optimized(folio); - static_branch_dec(&hugetlb_optimize_vmemmap_key); - } return ret; } @@ -544,8 +539,6 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, if (!vmemmap_tail) return -ENOMEM; - static_branch_inc(&hugetlb_optimize_vmemmap_key); - /* * Very Subtle * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed @@ -581,10 +574,8 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, vmemmap_head, vmemmap_tail, vmemmap_pages, flags); out: - if (ret) { - static_branch_dec(&hugetlb_optimize_vmemmap_key); + if (ret) folio_clear_hugetlb_vmemmap_optimized(folio); - } return ret; } @@ -650,7 +641,6 @@ static void __hugetlb_vmemmap_optimize_folios(struct hstate *h, register_page_bootmem_memmap(pfn_to_section_nr(spfn), &folio->page, HUGETLB_VMEMMAP_RESERVE_SIZE); - static_branch_inc(&hugetlb_optimize_vmemmap_key); continue; } -- cgit v1.2.3 From 66b2a3d9ae460934fef5fd588077730f483e8c8c Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:54 +0000 Subject: mm: remove the branch from compound_head() The compound_head() function is a hot path. For example, the zap path calls it for every leaf page table entry. 
Rewrite the helper function in a branchless manner to eliminate the risk of CPU branch misprediction. Link: https://lkml.kernel.org/r/20260227194302.274384-17-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Reviewed-by: Zi Yan Acked-by: David Hildenbrand (Arm) Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f361bd6c814c..7223f6f4e2b4 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -224,25 +224,32 @@ static __always_inline bool compound_info_has_mask(void) static __always_inline unsigned long _compound_head(const struct page *page) { unsigned long info = READ_ONCE(page->compound_info); + unsigned long mask; - /* Bit 0 encodes PageTail() */ - if (!(info & 1)) - return (unsigned long)page; + if (!compound_info_has_mask()) { + /* Bit 0 encodes PageTail() */ + if (info & 1) + return info - 1; - /* - * If compound_info_has_mask() is false, the rest of compound_info is - * the pointer to the head page. - */ - if (!compound_info_has_mask()) - return info - 1; + return (unsigned long)page; + } /* * If compound_info_has_mask() is true the rest of the info encodes * the mask that converts the address of the tail page to the head page. * * No need to clear bit 0 in the mask as 'page' always has it clear. + * + * Let's do it in a branchless manner. 
*/ - return (unsigned long)page & info; + + /* Non-tail: -1UL, Tail: 0 */ + mask = (info & 1) - 1; + + /* Non-tail: -1UL, Tail: info */ + mask |= info; + + return (unsigned long)page & mask; } #define compound_head(page) ((typeof(page))_compound_head(page)) -- cgit v1.2.3 From 99573ef4ac30d4eae7a7937f0c9ea351991e3ccc Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 22:29:52 +0100 Subject: mm/pagewalk: drop FW_MIGRATION We removed the last user of FW_MIGRATION in commit 912aa825957f ("Revert "mm/ksm: convert break_ksm() from walk_page_range_vma() to folio_walk""). So let's remove FW_MIGRATION and assign FW_ZEROPAGE bit 0. Including leafops.h is no longer required. While at it, convert "expose_page" to "zeropage", as zeropages are now the only remaining use case for not exposing a page. Link: https://lkml.kernel.org/r/20260227212952.190691-1-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Cc: Lorenzo Stoakes Cc: "Liam R. Howlett" Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/pagewalk.h | 8 +------- mm/pagewalk.c | 40 ++++++++-------------------------------- 2 files changed, 9 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 88e18615dd72..b41d7265c01b 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -148,14 +148,8 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, typedef int __bitwise folio_walk_flags_t; -/* - * Walk migration entries as well. Careful: a large folio might get split - * concurrently. - */ -#define FW_MIGRATION ((__force folio_walk_flags_t)BIT(0)) - /* Walk shared zeropages (small + huge) as well. 
*/ -#define FW_ZEROPAGE ((__force folio_walk_flags_t)BIT(1)) +#define FW_ZEROPAGE ((__force folio_walk_flags_t)BIT(0)) enum folio_walk_level { FW_LEVEL_PTE, diff --git a/mm/pagewalk.c b/mm/pagewalk.c index a94c401ab2cf..cb358558807c 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -5,7 +5,6 @@ #include #include #include -#include #include @@ -841,9 +840,6 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, * VM as documented by vm_normal_page(). If requested, zeropages will be * returned as well. * - * As default, this function only considers present page table entries. - * If requested, it will also consider migration entries. - * * If this function returns NULL it might either indicate "there is nothing" or * "there is nothing suitable". * @@ -854,11 +850,10 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, * that call. * * @fw->page will correspond to the page that is effectively referenced by - * @addr. However, for migration entries and shared zeropages @fw->page is - * set to NULL. Note that large folios might be mapped by multiple page table - * entries, and this function will always only lookup a single entry as - * specified by @addr, which might or might not cover more than a single page of - * the returned folio. + * @addr. However, for shared zeropages @fw->page is set to NULL. Note that + * large folios might be mapped by multiple page table entries, and this + * function will always only lookup a single entry as specified by @addr, which + * might or might not cover more than a single page of the returned folio. 
* * This function must *not* be used as a naive replacement for * get_user_pages() / pin_user_pages(), especially not to perform DMA or @@ -885,7 +880,7 @@ struct folio *folio_walk_start(struct folio_walk *fw, folio_walk_flags_t flags) { unsigned long entry_size; - bool expose_page = true; + bool zeropage = false; struct page *page; pud_t *pudp, pud; pmd_t *pmdp, pmd; @@ -933,10 +928,6 @@ struct folio *folio_walk_start(struct folio_walk *fw, if (page) goto found; } - /* - * TODO: FW_MIGRATION support for PUD migration entries - * once there are relevant users. - */ spin_unlock(ptl); goto not_found; } @@ -970,16 +961,9 @@ pmd_table: } else if ((flags & FW_ZEROPAGE) && is_huge_zero_pmd(pmd)) { page = pfn_to_page(pmd_pfn(pmd)); - expose_page = false; + zeropage = true; goto found; } - } else if ((flags & FW_MIGRATION) && - pmd_is_migration_entry(pmd)) { - const softleaf_t entry = softleaf_from_pmd(pmd); - - page = softleaf_to_page(entry); - expose_page = false; - goto found; } spin_unlock(ptl); goto not_found; @@ -1004,15 +988,7 @@ pte_table: if ((flags & FW_ZEROPAGE) && is_zero_pfn(pte_pfn(pte))) { page = pfn_to_page(pte_pfn(pte)); - expose_page = false; - goto found; - } - } else if (!pte_none(pte)) { - const softleaf_t entry = softleaf_from_pte(pte); - - if ((flags & FW_MIGRATION) && softleaf_is_migration(entry)) { - page = softleaf_to_page(entry); - expose_page = false; + zeropage = true; goto found; } } @@ -1021,7 +997,7 @@ not_found: vma_pgtable_walk_end(vma); return NULL; found: - if (expose_page) + if (!zeropage) /* Note: Offset from the mapped page, not the folio start. */ fw->page = page + ((addr & (entry_size - 1)) >> PAGE_SHIFT); else -- cgit v1.2.3 From 3d56d7317b271a1a5030ebb135c58aedc4c0fd36 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 27 Feb 2026 04:03:00 +0000 Subject: mm: replace READ_ONCE() in pud_trans_unstable() Replace READ_ONCE() with the existing standard page table accessor for PUD aka pudp_get() in pud_trans_unstable(). 
This does not create any functional change for platforms that do not override pudp_get(), which still defaults to READ_ONCE(). Link: https://lkml.kernel.org/r/20260227040300.2091901-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: David Hildenbrand (Arm) Acked-by: SeongJae Park Reviewed-by: Lorenzo Stoakes Cc: Mike Rapoport Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 776993d4567b..d2767a4c027b 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -2004,7 +2004,7 @@ static inline int pud_trans_unstable(pud_t *pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) - pud_t pudval = READ_ONCE(*pud); + pud_t pudval = pudp_get(pud); if (pud_none(pudval) || pud_trans_huge(pudval)) return 1; -- cgit v1.2.3 From 28266ac94a50e585c267a79d9ef5c2803d4dcd7a Mon Sep 17 00:00:00 2001 From: Gladyshev Ilya Date: Sun, 1 Mar 2026 13:19:39 +0000 Subject: mm: make ref_unless functions unless_zero only There are no users of (folio/page)_ref_add_unless(page, nr, u) with u != 0 [1] and all current users are "internal" to the page refcounting API. This allows us to safely drop this parameter and reduce function semantics to the "unless zero" cases only. If needed, these functions for the u != 0 cases can be trivially reintroduced later using the same atomic_add_unless operations as before. [1]: The last user was dropped in the v5.18 kernel, commit 27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page refcount"). There is no trace of discussion as to why this cleanup wasn't done earlier.
Link: https://lkml.kernel.org/r/a0c89b49d38c671a0bdd35069d15ee13e08314d2.1772370066.git.gladyshev.ilya1@h-partners.com Co-developed-by: Gorbunov Ivan Signed-off-by: Gorbunov Ivan Signed-off-by: Gladyshev Ilya Acked-by: David Hildenbrand (Arm) Acked-by: Kiryl Shutsemau Acked-by: Zi Yan Reviewed-by: Lorenzo Stoakes Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- include/linux/page_ref.h | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 19619e5efeba..08b743aab92a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1506,7 +1506,7 @@ static inline int folio_put_testzero(struct folio *folio) */ static inline bool get_page_unless_zero(struct page *page) { - return page_ref_add_unless(page, 1, 0); + return page_ref_add_unless_zero(page, 1); } static inline struct folio *folio_get_nontail_page(struct page *page) diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 490d0ad6e56d..94d3f0e71c06 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -228,18 +228,18 @@ static inline int folio_ref_dec_return(struct folio *folio) return page_ref_dec_return(&folio->page); } -static inline bool page_ref_add_unless(struct page *page, int nr, int u) +static inline bool page_ref_add_unless_zero(struct page *page, int nr) { - bool ret = atomic_add_unless(&page->_refcount, nr, u); + bool ret = atomic_add_unless(&page->_refcount, nr, 0); if (page_ref_tracepoint_active(page_ref_mod_unless)) __page_ref_mod_unless(page, nr, ret); return ret; } -static inline bool folio_ref_add_unless(struct folio *folio, int nr, int u) +static inline bool folio_ref_add_unless_zero(struct folio *folio, int nr) { - return page_ref_add_unless(&folio->page, nr, u); + return page_ref_add_unless_zero(&folio->page, nr); } /** @@ 
-255,12 +255,12 @@ static inline bool folio_ref_add_unless(struct folio *folio, int nr, int u) */ static inline bool folio_try_get(struct folio *folio) { - return folio_ref_add_unless(folio, 1, 0); + return folio_ref_add_unless_zero(folio, 1); } static inline bool folio_ref_try_add(struct folio *folio, int count) { - return folio_ref_add_unless(folio, count, 0); + return folio_ref_add_unless_zero(folio, count); } static inline int page_ref_freeze(struct page *page, int count) -- cgit v1.2.3 From de008c9ba5684f14e83bcf86cd45fb0e4e6c4d82 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:33 +0100 Subject: mm/memory: remove "zap_details" parameter from zap_page_range_single() Nobody except memory.c should really set that parameter to non-NULL. So let's just drop it and make unmap_mapping_range_vma() use zap_page_range_single_batched() instead. [david@kernel.org: format on a single line] Link: https://lkml.kernel.org/r/8a27e9ac-2025-4724-a46d-0a7c90894ba7@kernel.org Link: https://lkml.kernel.org/r/20260227200848.114019-3-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Puranjay Mohan Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. 
Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- arch/s390/mm/gmap_helpers.c | 2 +- drivers/android/binder_alloc.c | 2 +- include/linux/mm.h | 5 ++--- kernel/bpf/arena.c | 3 +-- kernel/events/core.c | 2 +- mm/madvise.c | 3 +-- mm/memory.c | 16 ++++++++++------ net/ipv4/tcp.c | 5 ++--- rust/kernel/mm/virt.rs | 4 +--- 9 files changed, 20 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c index dea83e3103e5..ae2d59a19313 100644 --- a/arch/s390/mm/gmap_helpers.c +++ b/arch/s390/mm/gmap_helpers.c @@ -89,7 +89,7 @@ void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned lo if (!vma) return; if (!is_vm_hugetlb_page(vma)) - zap_page_range_single(vma, vmaddr, min(end, vma->vm_end) - vmaddr, NULL); + zap_page_range_single(vma, vmaddr, min(end, vma->vm_end) - vmaddr); vmaddr = vma->vm_end; } } diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 241f16a9b63d..dd2046bd5cde 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -1185,7 +1185,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, if (vma) { trace_binder_unmap_user_start(alloc, index); - zap_page_range_single(vma, page_addr, PAGE_SIZE, NULL); + zap_page_range_single(vma, page_addr, PAGE_SIZE); 
trace_binder_unmap_user_end(alloc, index); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 08b743aab92a..6512d70c5852 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2804,11 +2804,10 @@ struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr, void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, - unsigned long size, struct zap_details *details); + unsigned long size); static inline void zap_vma_pages(struct vm_area_struct *vma) { - zap_page_range_single(vma, vma->vm_start, - vma->vm_end - vma->vm_start, NULL); + zap_page_range_single(vma, vma->vm_start, vma->vm_end - vma->vm_start); } struct mmu_notifier_range; diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index f355cf1c1a16..19cca936eb9d 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -656,8 +656,7 @@ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) guard(mutex)(&arena->lock); /* iterate link list under lock */ list_for_each_entry(vml, &arena->vma_list, head) - zap_page_range_single(vml->vma, uaddr, - PAGE_SIZE * page_cnt, NULL); + zap_page_range_single(vml->vma, uaddr, PAGE_SIZE * page_cnt); } static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable) diff --git a/kernel/events/core.c b/kernel/events/core.c index 89b40e439717..2ecdaabf1b4d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7213,7 +7213,7 @@ static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) #ifdef CONFIG_MMU /* Clear any partial mappings on error. 
*/ if (err) - zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL); + zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE); #endif return err; diff --git a/mm/madvise.c b/mm/madvise.c index 1313166c5514..e4a2728593a8 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1193,8 +1193,7 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior) * OK some of the range have non-guard pages mapped, zap * them. This leaves existing guard pages in place. */ - zap_page_range_single(vma, range->start, - range->end - range->start, NULL); + zap_page_range_single(vma, range->start, range->end - range->start); } /* diff --git a/mm/memory.c b/mm/memory.c index f78ab3869f8d..fbd02d5bd520 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2203,17 +2203,16 @@ void zap_page_range_single_batched(struct mmu_gather *tlb, * @vma: vm_area_struct holding the applicable pages * @address: starting address of pages to zap * @size: number of bytes to zap - * @details: details of shared cache invalidation * * The range must fit into one VMA. */ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, - unsigned long size, struct zap_details *details) + unsigned long size) { struct mmu_gather tlb; tlb_gather_mmu(&tlb, vma->vm_mm); - zap_page_range_single_batched(&tlb, vma, address, size, details); + zap_page_range_single_batched(&tlb, vma, address, size, NULL); tlb_finish_mmu(&tlb); } @@ -2235,7 +2234,7 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, !(vma->vm_flags & VM_PFNMAP)) return; - zap_page_range_single(vma, address, size, NULL); + zap_page_range_single(vma, address, size); } EXPORT_SYMBOL_GPL(zap_vma_ptes); @@ -3003,7 +3002,7 @@ static int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long add * maintain page reference counts, and callers may free * pages due to the error. So zap it early. 
*/ - zap_page_range_single(vma, addr, size, NULL); + zap_page_range_single(vma, addr, size); return error; } @@ -4226,7 +4225,12 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details) { - zap_page_range_single(vma, start_addr, end_addr - start_addr, details); + struct mmu_gather tlb; + + tlb_gather_mmu(&tlb, vma->vm_mm); + zap_page_range_single_batched(&tlb, vma, start_addr, + end_addr - start_addr, details); + tlb_finish_mmu(&tlb); } static inline void unmap_mapping_range_tree(struct rb_root_cached *root, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 202a4e57a218..89c962672e51 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2105,7 +2105,7 @@ static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, maybe_zap_len = total_bytes_to_map - /* All bytes to map */ *length + /* Mapped or pending */ (pages_remaining * PAGE_SIZE); /* Failed map. */ - zap_page_range_single(vma, *address, maybe_zap_len, NULL); + zap_page_range_single(vma, *address, maybe_zap_len); err = 0; } @@ -2270,8 +2270,7 @@ static int tcp_zerocopy_receive(struct sock *sk, total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1); if (total_bytes_to_map) { if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT)) - zap_page_range_single(vma, address, total_bytes_to_map, - NULL); + zap_page_range_single(vma, address, total_bytes_to_map); zc->length = total_bytes_to_map; zc->recv_skip_hint = 0; } else { diff --git a/rust/kernel/mm/virt.rs b/rust/kernel/mm/virt.rs index da21d65ccd20..6bfd91cfa1f4 100644 --- a/rust/kernel/mm/virt.rs +++ b/rust/kernel/mm/virt.rs @@ -123,9 +123,7 @@ impl VmaRef { // SAFETY: By the type invariants, the caller has read access to this VMA, which is // sufficient for this method call. This method has no requirements on the vma flags. The // address range is checked to be within the vma. 
- unsafe { - bindings::zap_page_range_single(self.as_ptr(), address, size, core::ptr::null_mut()) - }; + unsafe { bindings::zap_page_range_single(self.as_ptr(), address, size) }; } /// If the [`VM_MIXEDMAP`] flag is set, returns a [`VmaMixedMap`] to this VMA, otherwise -- cgit v1.2.3 From 599a59e6037838ea7cd6264d7980ea63de244994 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:35 +0100 Subject: mm/memory: simplify calculation in unmap_mapping_range_tree() Let's simplify the calculation a bit further to make it easier to get, reusing vma_last_pgoff() which we move from interval_tree.c to mm.h. Link: https://lkml.kernel.org/r/20260227200848.114019-5-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. 
Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ mm/interval_tree.c | 5 ----- mm/memory.c | 12 +++++------- 3 files changed, 10 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6512d70c5852..771d021b7948 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3969,6 +3969,11 @@ static inline unsigned long vma_pages(const struct vm_area_struct *vma) return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } +static inline unsigned long vma_last_pgoff(struct vm_area_struct *vma) +{ + return vma->vm_pgoff + vma_pages(vma) - 1; +} + static inline unsigned long vma_desc_size(const struct vm_area_desc *desc) { return desc->end - desc->start; diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 32e390c42c53..32bcfbfcf15f 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -15,11 +15,6 @@ static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) return v->vm_pgoff; } -static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) -{ - return v->vm_pgoff + vma_pages(v) - 1; -} - INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, unsigned long, shared.rb_subtree_last, vma_start_pgoff, vma_last_pgoff, /* empty */, vma_interval_tree) diff --git a/mm/memory.c b/mm/memory.c index f1c5d6b01a62..24b768885379 100644 --- 
a/mm/memory.c +++ b/mm/memory.c @@ -4227,17 +4227,15 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root, struct zap_details *details) { struct vm_area_struct *vma; - pgoff_t vba, vea, zba, zea; unsigned long start, size; struct mmu_gather tlb; vma_interval_tree_foreach(vma, root, first_index, last_index) { - vba = vma->vm_pgoff; - vea = vba + vma_pages(vma) - 1; - zba = max(first_index, vba); - zea = min(last_index, vea); - start = ((zba - vba) << PAGE_SHIFT) + vma->vm_start; - size = (zea - zba + 1) << PAGE_SHIFT; + const pgoff_t start_idx = max(first_index, vma->vm_pgoff); + const pgoff_t end_idx = min(last_index, vma_last_pgoff(vma)) + 1; + + start = vma->vm_start + ((start_idx - vma->vm_pgoff) << PAGE_SHIFT); + size = (end_idx - start_idx) << PAGE_SHIFT; tlb_gather_mmu(&tlb, vma->vm_mm); zap_page_range_single_batched(&tlb, vma, start, size, details); -- cgit v1.2.3 From a97bc13d15f472c7f8ede1b38660fb55b6dab68d Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:40 +0100 Subject: mm/memory: convert details->even_cows into details->skip_cows The current semantics are confusing: simply specifying an empty zap_details struct suddenly makes should_zap_cows() behave differently. The default should be to also zap CoW'ed anonymous pages. Really only unmap_mapping_pages() and friends want to skip zapping of these anon folios. So let's invert the meaning; turn the confusing "reclaim_pt" check that overrides other properties in should_zap_cows() into a safety check. Note that the only caller that sets reclaim_pt=true is madvise_dontneed_single_vma(), which wants to zap any pages.
Link: https://lkml.kernel.org/r/20260227200848.114019-10-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- mm/madvise.c | 1 - mm/memory.c | 12 ++++++------ 3 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 771d021b7948..cb4f5fbccaf0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2767,7 +2767,7 @@ extern void pagefault_out_of_memory(void); */ struct zap_details { struct folio *single_folio; /* Locked folio to be unmapped */ - bool even_cows; /* Zap COWed private pages too? */ + bool skip_cows; /* Do not zap COWed private pages */ bool reclaim_pt; /* Need reclaim page tables? 
*/ zap_flags_t zap_flags; /* Extra flags for zapping */ }; diff --git a/mm/madvise.c b/mm/madvise.c index e4a2728593a8..e86228682842 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -853,7 +853,6 @@ static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior) struct madvise_behavior_range *range = &madv_behavior->range; struct zap_details details = { .reclaim_pt = true, - .even_cows = true, }; zap_page_range_single_batched( diff --git a/mm/memory.c b/mm/memory.c index 7e5d52534ee9..c66b7b8b47eb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1554,11 +1554,13 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) static inline bool should_zap_cows(struct zap_details *details) { /* By default, zap all pages */ - if (!details || details->reclaim_pt) + if (!details) return true; + VM_WARN_ON_ONCE(details->skip_cows && details->reclaim_pt); + /* Or, we zap COWed pages only if the caller wants to */ - return details->even_cows; + return !details->skip_cows; } /* Decides whether we should zap this folio with the folio pointer specified */ @@ -2149,8 +2151,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap) struct mmu_notifier_range range; struct zap_details details = { .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP, - /* Careful - we need to zap private pages too! 
*/ - .even_cows = true, }; vma = unmap->first; @@ -4282,7 +4282,7 @@ void unmap_mapping_folio(struct folio *folio) first_index = folio->index; last_index = folio_next_index(folio) - 1; - details.even_cows = false; + details.skip_cows = true; details.single_folio = folio; details.zap_flags = ZAP_FLAG_DROP_MARKER; @@ -4312,7 +4312,7 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t first_index = start; pgoff_t last_index = start + nr - 1; - details.even_cows = even_cows; + details.skip_cows = !even_cows; if (last_index < first_index) last_index = ULONG_MAX; -- cgit v1.2.3 From 5f10cbbddc2bd80a5944f1c783830f7ebf648ad2 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:41 +0100 Subject: mm/memory: use __zap_vma_range() in zap_vma_for_reaping() Let's call __zap_vma_range() instead of unmap_page_range() to prepare for further cleanups. To keep the existing behavior, whereby we do not call uprobe_munmap() which could block, add a new "reaping" member to zap_details and use it. Likely we should handle the possible blocking in uprobe_munmap() differently, but for now keep it unchanged. Link: https://lkml.kernel.org/r/20260227200848.114019-11-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. 
Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + mm/memory.c | 13 +++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index cb4f5fbccaf0..488a144c9161 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2769,6 +2769,7 @@ struct zap_details { struct folio *single_folio; /* Locked folio to be unmapped */ bool skip_cows; /* Do not zap COWed private pages */ bool reclaim_pt; /* Need reclaim page tables? */ + bool reaping; /* Reaping, do not block. */ zap_flags_t zap_flags; /* Extra flags for zapping */ }; diff --git a/mm/memory.c b/mm/memory.c index c66b7b8b47eb..d1fd3cdd677a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2079,14 +2079,18 @@ static void __zap_vma_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct zap_details *details) { + const bool reaping = details && details->reaping; + VM_WARN_ON_ONCE(start >= end || !range_in_vma(vma, start, end)); - if (vma->vm_file) + /* uprobe_munmap() might sleep, so skip it when reaping. */ + if (vma->vm_file && !reaping) uprobe_munmap(vma, start, end); if (unlikely(is_vm_hugetlb_page(vma))) { zap_flags_t zap_flags = details ? 
details->zap_flags : 0; + VM_WARN_ON_ONCE(reaping); /* * vm_file will be NULL when we fail early while instantiating * a new mapping. In this case, no pages were mapped yet and @@ -2111,11 +2115,12 @@ static void __zap_vma_range(struct mmu_gather *tlb, struct vm_area_struct *vma, */ int zap_vma_for_reaping(struct vm_area_struct *vma) { + struct zap_details details = { + .reaping = true, + }; struct mmu_notifier_range range; struct mmu_gather tlb; - VM_WARN_ON_ONCE(is_vm_hugetlb_page(vma)); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, vma->vm_start, vma->vm_end); tlb_gather_mmu(&tlb, vma->vm_mm); @@ -2123,7 +2128,7 @@ int zap_vma_for_reaping(struct vm_area_struct *vma) tlb_finish_mmu(&tlb); return -EBUSY; } - unmap_page_range(&tlb, vma, range.start, range.end, NULL); + __zap_vma_range(&tlb, vma, range.start, range.end, &details); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); return 0; -- cgit v1.2.3 From 32bc7fe4a6f4d359b6de96cbc106d2cac695154e Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:43 +0100 Subject: mm: rename zap_vma_pages() to zap_vma() Let's rename it to an even simpler name. While at it, add some simplistic kernel doc. Link: https://lkml.kernel.org/r/20260227200848.114019-13-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. 
Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- arch/powerpc/platforms/book3s/vas-api.c | 2 +- arch/powerpc/platforms/pseries/vas.c | 2 +- include/linux/mm.h | 6 +++++- lib/vdso/datastore.c | 2 +- mm/page-writeback.c | 2 +- 5 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/arch/powerpc/platforms/book3s/vas-api.c b/arch/powerpc/platforms/book3s/vas-api.c index ea4ffa63f043..e96d79db69fe 100644 --- a/arch/powerpc/platforms/book3s/vas-api.c +++ b/arch/powerpc/platforms/book3s/vas-api.c @@ -414,7 +414,7 @@ static vm_fault_t vas_mmap_fault(struct vm_fault *vmf) /* * When the LPAR lost credits due to core removal or during * migration, invalidate the existing mapping for the current - * paste addresses and set windows in-active (zap_vma_pages in + * paste addresses and set windows in-active (zap_vma() in * reconfig_close_windows()). * New mapping will be done later after migration or new credits * available. 
So continue to receive faults if the user space diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index ceb0a8788c0a..fa05f04364fe 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -807,7 +807,7 @@ static int reconfig_close_windows(struct vas_caps *vcap, int excess_creds, * is done before the original mmap() and after the ioctl. */ if (vma) - zap_vma_pages(vma); + zap_vma(vma); mutex_unlock(&task_ref->mmap_mutex); mmap_write_unlock(task_ref->mm); diff --git a/include/linux/mm.h b/include/linux/mm.h index 488a144c9161..60c13d40c65c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2806,7 +2806,11 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size); -static inline void zap_vma_pages(struct vm_area_struct *vma) +/** + * zap_vma - zap all page table entries in a vma + * @vma: The vma to zap. + */ +static inline void zap_vma(struct vm_area_struct *vma) { zap_page_range_single(vma, vma->vm_start, vma->vm_end - vma->vm_start); } diff --git a/lib/vdso/datastore.c b/lib/vdso/datastore.c index a565c30c71a0..222c143aebf7 100644 --- a/lib/vdso/datastore.c +++ b/lib/vdso/datastore.c @@ -121,7 +121,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) mmap_read_lock(mm); for_each_vma(vmi, vma) { if (vma_is_special_mapping(vma, &vdso_vvar_mapping)) - zap_vma_pages(vma); + zap_vma(vma); } mmap_read_unlock(mm); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 1009bb042ba4..8dc47b59ca18 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2645,7 +2645,7 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) * while this function is in progress, although it may have been truncated * before this function is called. Most callers have the folio locked. 
* A few have the folio blocked from truncation through other means (e.g. - * zap_vma_pages() has it mapped and is holding the page table lock). + * zap_vma() has it mapped and is holding the page table lock). * When called from mark_buffer_dirty(), the filesystem should hold a * reference to the buffer_head that is being marked dirty, which causes * try_to_free_buffers() to fail. -- cgit v1.2.3 From 0326440c3545c86b6501c7c636fcf018d6e87b8c Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:45 +0100 Subject: mm: rename zap_page_range_single() to zap_vma_range() Let's rename it to make it better match our new naming scheme. While at it, polish the kerneldoc. [akpm@linux-foundation.org: fix rustfmtcheck] Link: https://lkml.kernel.org/r/20260227200848.114019-15-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Puranjay Mohan Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. 
Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- arch/s390/mm/gmap_helpers.c | 2 +- drivers/android/binder/page_range.rs | 4 ++-- drivers/android/binder_alloc.c | 2 +- include/linux/mm.h | 4 ++-- kernel/bpf/arena.c | 2 +- kernel/events/core.c | 2 +- mm/madvise.c | 4 ++-- mm/memory.c | 14 +++++++------- net/ipv4/tcp.c | 6 +++--- rust/kernel/mm/virt.rs | 4 ++-- 10 files changed, 22 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c index ae2d59a19313..f8789ffcc05c 100644 --- a/arch/s390/mm/gmap_helpers.c +++ b/arch/s390/mm/gmap_helpers.c @@ -89,7 +89,7 @@ void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned lo if (!vma) return; if (!is_vm_hugetlb_page(vma)) - zap_page_range_single(vma, vmaddr, min(end, vma->vm_end) - vmaddr); + zap_vma_range(vma, vmaddr, min(end, vma->vm_end) - vmaddr); vmaddr = vma->vm_end; } } diff --git a/drivers/android/binder/page_range.rs b/drivers/android/binder/page_range.rs index 9dfc154e5dd4..8882fd18d9f3 100644 --- a/drivers/android/binder/page_range.rs +++ b/drivers/android/binder/page_range.rs @@ -130,7 +130,7 @@ pub(crate) struct ShrinkablePageRange { pid: Pid, /// The mm for the relevant process. mm: ARef, - /// Used to synchronize calls to `vm_insert_page` and `zap_page_range_single`. 
+ /// Used to synchronize calls to `vm_insert_page` and `zap_vma_range`. #[pin] mm_lock: Mutex<()>, /// Spinlock protecting changes to pages. @@ -762,7 +762,7 @@ unsafe extern "C" fn rust_shrink_free_page( if let Some(unchecked_vma) = mmap_read.vma_lookup(vma_addr) { if let Some(vma) = check_vma(unchecked_vma, range_ptr) { let user_page_addr = vma_addr + (page_index << PAGE_SHIFT); - vma.zap_page_range_single(user_page_addr, PAGE_SIZE); + vma.zap_vma_range(user_page_addr, PAGE_SIZE); } } diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index dd2046bd5cde..e4488ad86a65 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -1185,7 +1185,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, if (vma) { trace_binder_unmap_user_start(alloc, index); - zap_page_range_single(vma, page_addr, PAGE_SIZE); + zap_vma_range(vma, page_addr, PAGE_SIZE); trace_binder_unmap_user_end(alloc, index); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 60c13d40c65c..10a5b9ba4eeb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2804,7 +2804,7 @@ struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr, void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); -void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, +void zap_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); /** * zap_vma - zap all page table entries in a vma @@ -2812,7 +2812,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, */ static inline void zap_vma(struct vm_area_struct *vma) { - zap_page_range_single(vma, vma->vm_start, vma->vm_end - vma->vm_start); + zap_vma_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); } struct mmu_notifier_range; diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 19cca936eb9d..08d008cc471e 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ 
-656,7 +656,7 @@ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) guard(mutex)(&arena->lock); /* iterate link list under lock */ list_for_each_entry(vml, &arena->vma_list, head) - zap_page_range_single(vml->vma, uaddr, PAGE_SIZE * page_cnt); + zap_vma_range(vml->vma, uaddr, PAGE_SIZE * page_cnt); } static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable) diff --git a/kernel/events/core.c b/kernel/events/core.c index 2ecdaabf1b4d..d5b21077e829 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7213,7 +7213,7 @@ static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) #ifdef CONFIG_MMU /* Clear any partial mappings on error. */ if (err) - zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE); + zap_vma_range(vma, vma->vm_start, nr_pages * PAGE_SIZE); #endif return err; diff --git a/mm/madvise.c b/mm/madvise.c index a50ec5f90e3e..afe0f01765c4 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -832,7 +832,7 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior) * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about * data it wants to keep. Be sure to free swap resources too. The - * zap_page_range_single call sets things up for shrink_active_list to actually + * zap_vma_range call sets things up for shrink_active_list to actually * free these pages later if no one else has touched them in the meantime, * although we could add these pages to a global reuse list for * shrink_active_list to pick up before reclaiming other pages. @@ -1191,7 +1191,7 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior) * OK some of the range have non-guard pages mapped, zap * them. This leaves existing guard pages in place. 
*/ - zap_page_range_single(vma, range->start, range->end - range->start); + zap_vma_range(vma, range->start, range->end - range->start); } /* diff --git a/mm/memory.c b/mm/memory.c index 879858e466ef..dd80fbf6473a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2215,14 +2215,14 @@ void zap_vma_range_batched(struct mmu_gather *tlb, } /** - * zap_page_range_single - remove user pages in a given range - * @vma: vm_area_struct holding the applicable pages - * @address: starting address of pages to zap + * zap_vma_range - zap all page table entries in a vma range + * @vma: the vma covering the range to zap + * @address: starting address of the range to zap * @size: number of bytes to zap * - * The range must fit into one VMA. + * The provided address range must be fully contained within @vma. */ -void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, +void zap_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size) { struct mmu_gather tlb; @@ -2250,7 +2250,7 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, !(vma->vm_flags & VM_PFNMAP)) return; - zap_page_range_single(vma, address, size); + zap_vma_range(vma, address, size); } EXPORT_SYMBOL_GPL(zap_vma_ptes); @@ -3018,7 +3018,7 @@ static int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long add * maintain page reference counts, and callers may free * pages due to the error. So zap it early. */ - zap_page_range_single(vma, addr, size); + zap_vma_range(vma, addr, size); return error; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 89c962672e51..9573ce9b0ac1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2105,7 +2105,7 @@ static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, maybe_zap_len = total_bytes_to_map - /* All bytes to map */ *length + /* Mapped or pending */ (pages_remaining * PAGE_SIZE); /* Failed map. 
*/ - zap_page_range_single(vma, *address, maybe_zap_len); + zap_vma_range(vma, *address, maybe_zap_len); err = 0; } @@ -2113,7 +2113,7 @@ static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, unsigned long leftover_pages = pages_remaining; int bytes_mapped; - /* We called zap_page_range_single, try to reinsert. */ + /* We called zap_vma_range, try to reinsert. */ err = vm_insert_pages(vma, *address, pending_pages, &pages_remaining); @@ -2270,7 +2270,7 @@ static int tcp_zerocopy_receive(struct sock *sk, total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1); if (total_bytes_to_map) { if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT)) - zap_page_range_single(vma, address, total_bytes_to_map); + zap_vma_range(vma, address, total_bytes_to_map); zc->length = total_bytes_to_map; zc->recv_skip_hint = 0; } else { diff --git a/rust/kernel/mm/virt.rs b/rust/kernel/mm/virt.rs index 6bfd91cfa1f4..63eb730b0b05 100644 --- a/rust/kernel/mm/virt.rs +++ b/rust/kernel/mm/virt.rs @@ -113,7 +113,7 @@ impl VmaRef { /// kernel goes further in freeing unused page tables, but for the purposes of this operation /// we must only assume that the leaf level is cleared. #[inline] - pub fn zap_page_range_single(&self, address: usize, size: usize) { + pub fn zap_vma_range(&self, address: usize, size: usize) { let (end, did_overflow) = address.overflowing_add(size); if did_overflow || address < self.start() || self.end() < end { // TODO: call WARN_ONCE once Rust version of it is added @@ -123,7 +123,7 @@ impl VmaRef { // SAFETY: By the type invariants, the caller has read access to this VMA, which is // sufficient for this method call. This method has no requirements on the vma flags. The // address range is checked to be within the vma. 
- unsafe { bindings::zap_page_range_single(self.as_ptr(), address, size) }; + unsafe { bindings::zap_vma_range(self.as_ptr(), address, size) }; } /// If the [`VM_MIXEDMAP`] flag is set, returns a [`VmaMixedMap`] to this VMA, otherwise -- cgit v1.2.3 From 52a9e9cd181fab8b03cf4e982533224697669976 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:46 +0100 Subject: mm: rename zap_vma_ptes() to zap_special_vma_range() zap_vma_ptes() is the only zapping function we export to modules. It's essentially a wrapper around zap_vma_range(), however, with some safety checks: * That the passed range fits fully into the VMA * That it's only used for VM_PFNMAP We will add support for VM_MIXEDMAP next, so use the more-generic term "special vma", although "special" is a bit overloaded. Maybe we'll later just support any VM_SPECIAL flag. While at it, improve the kerneldoc. Link: https://lkml.kernel.org/r/20260227200848.114019-16-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Acked-by: Leon Romanovsky [drivers/infiniband] Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. 
Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- arch/x86/kernel/cpu/sgx/encl.c | 2 +- drivers/comedi/comedi_fops.c | 2 +- drivers/gpu/drm/i915/i915_mm.c | 4 ++-- drivers/infiniband/core/uverbs_main.c | 6 +++--- drivers/misc/sgi-gru/grumain.c | 2 +- include/linux/mm.h | 2 +- mm/memory.c | 16 +++++++--------- 7 files changed, 16 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index ac60ebde5d9b..3f0222d10f6e 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -1220,7 +1220,7 @@ void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr) ret = sgx_encl_find(encl_mm->mm, addr, &vma); if (!ret && encl == vma->vm_private_data) - zap_vma_ptes(vma, addr, PAGE_SIZE); + zap_special_vma_range(vma, addr, PAGE_SIZE); mmap_read_unlock(encl_mm->mm); diff --git a/drivers/comedi/comedi_fops.c b/drivers/comedi/comedi_fops.c index 48a8a607a84c..b91e0b5ac394 100644 --- a/drivers/comedi/comedi_fops.c +++ b/drivers/comedi/comedi_fops.c @@ -2588,7 +2588,7 @@ static int comedi_mmap(struct file *file, struct vm_area_struct *vma) * remap_pfn_range() because we call remap_pfn_range() in a loop. 
*/ if (retval) - zap_vma_ptes(vma, vma->vm_start, size); + zap_special_vma_range(vma, vma->vm_start, size); #endif if (retval == 0) { diff --git a/drivers/gpu/drm/i915/i915_mm.c b/drivers/gpu/drm/i915/i915_mm.c index c33bd3d83069..fd89e7c7d8d6 100644 --- a/drivers/gpu/drm/i915/i915_mm.c +++ b/drivers/gpu/drm/i915/i915_mm.c @@ -108,7 +108,7 @@ int remap_io_mapping(struct vm_area_struct *vma, err = apply_to_page_range(r.mm, addr, size, remap_pfn, &r); if (unlikely(err)) { - zap_vma_ptes(vma, addr, (r.pfn - pfn) << PAGE_SHIFT); + zap_special_vma_range(vma, addr, (r.pfn - pfn) << PAGE_SHIFT); return err; } @@ -156,7 +156,7 @@ int remap_io_sg(struct vm_area_struct *vma, err = apply_to_page_range(r.mm, addr, size, remap_sg, &r); if (unlikely(err)) { - zap_vma_ptes(vma, addr, r.pfn << PAGE_SHIFT); + zap_special_vma_range(vma, addr, r.pfn << PAGE_SHIFT); return err; } diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 7b68967a6301..f5837da47299 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -756,7 +756,7 @@ out_zap: * point, so zap it. */ vma->vm_private_data = NULL; - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); + zap_special_vma_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); } static void rdma_umap_close(struct vm_area_struct *vma) @@ -782,7 +782,7 @@ static void rdma_umap_close(struct vm_area_struct *vma) } /* - * Once the zap_vma_ptes has been called touches to the VMA will come here and + * Once the zap_special_vma_range has been called touches to the VMA will come here and * we return a dummy writable zero page for all the pfns. 
*/ static vm_fault_t rdma_umap_fault(struct vm_fault *vmf) @@ -878,7 +878,7 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) continue; list_del_init(&priv->list); - zap_vma_ptes(vma, vma->vm_start, + zap_special_vma_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); if (priv->entry) { diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c index 8d749f345246..278b76cbd281 100644 --- a/drivers/misc/sgi-gru/grumain.c +++ b/drivers/misc/sgi-gru/grumain.c @@ -542,7 +542,7 @@ void gru_unload_context(struct gru_thread_state *gts, int savestate) int ctxnum = gts->ts_ctxnum; if (!is_kernel_context(gts)) - zap_vma_ptes(gts->ts_vma, UGRUADDR(gts), GRU_GSEG_PAGESIZE); + zap_special_vma_range(gts->ts_vma, UGRUADDR(gts), GRU_GSEG_PAGESIZE); cch = get_cch(gru->gs_gru_base_vaddr, ctxnum); gru_dbg(grudev, "gts %p, cbrmap 0x%lx, dsrmap 0x%lx\n", diff --git a/include/linux/mm.h b/include/linux/mm.h index 10a5b9ba4eeb..c516d5177211 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2802,7 +2802,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr, pud_t pud); -void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, +void zap_special_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); diff --git a/mm/memory.c b/mm/memory.c index dd80fbf6473a..3dc4664c9af7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2233,17 +2233,15 @@ void zap_vma_range(struct vm_area_struct *vma, unsigned long address, } /** - * zap_vma_ptes - remove ptes mapping the vma - * @vma: vm_area_struct holding ptes to be zapped - * @address: starting address of pages to zap + * zap_special_vma_range - zap all page table entries in a special vma range + * @vma: the vma covering the range to zap + * @address: starting address of 
the range to zap * @size: number of bytes to zap * - * This function only unmaps ptes assigned to VM_PFNMAP vmas. - * - * The entire address range must be fully contained within the vma. - * + * This function does nothing when the provided address range is not fully + * contained in @vma, or when the @vma is not VM_PFNMAP. */ -void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, +void zap_special_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size) { if (!range_in_vma(vma, address, address + size) || @@ -2252,7 +2250,7 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, zap_vma_range(vma, address, size); } -EXPORT_SYMBOL_GPL(zap_vma_ptes); +EXPORT_SYMBOL_GPL(zap_special_vma_range); static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr) { -- cgit v1.2.3 From 5a970006786a3b10577e762a9a6c0b9353b4e8a4 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 6 Mar 2026 14:43:37 +0800 Subject: mm: use inline helper functions instead of ugly macros Patch series "support batched checking of the young flag for MGLRU", v3. This is a follow-up to the previous work [1], to support batched checking of the young flag for MGLRU. Similarly, batched checking of young flag for large folios can improve performance during large-folio reclamation when MGLRU is enabled. I observed noticeable performance improvements (see patch 5) on an Arm64 machine that supports contiguous PTEs. All mm-selftests are passed. Patch 1 - 3: cleanup patches. Patch 4: add a new generic batched PTE helper: test_and_clear_young_ptes(). Patch 5: support batched young flag checking for MGLRU. Patch 6: implement the Arm64 arch-specific test_and_clear_young_ptes(). This patch (of 6): People have already complained that these *_clear_young_notify() related macros are very ugly, so let's use inline helpers to make them more readable. 
In addition, we cannot implement these inline helper functions in the mmu_notifier.h file, because some arch-specific files will include the mmu_notifier.h, which introduces header compilation dependencies and causes build errors (e.g., arch/arm64/include/asm/tlbflush.h). Moreover, since these functions are only used in the mm, implementing these inline helpers in the mm/internal.h header seems reasonable. Link: https://lkml.kernel.org/r/cover.1772778858.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/ea14af84e7967ccebb25082c28a8669d6da8fe57.1772778858.git.baolin.wang@linux.alibaba.com Link: https://lore.kernel.org/all/cover.1770645603.git.baolin.wang@linux.alibaba.com/ [1] Signed-off-by: Baolin Wang Reviewed-by: Rik van Riel Reviewed-by: Barry Song Acked-by: David Hildenbrand (Arm) Cc: Axel Rasmussen Cc: Catalin Marinas Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Qi Zheng Cc: Ryan Roberts Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: Yuanchu Xie Cc: Alistair Popple Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 54 -------------------------------------------- mm/internal.h | 52 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 8450e18a87c2..3705d350c863 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -516,55 +516,6 @@ static inline void mmu_notifier_range_init_owner( range->owner = owner; } -#define clear_flush_young_ptes_notify(__vma, __address, __ptep, __nr) \ -({ \ - int __young; \ - struct vm_area_struct *___vma = __vma; \ - unsigned long ___address = __address; \ - unsigned int ___nr = __nr; \ - __young = clear_flush_young_ptes(___vma, ___address, __ptep, ___nr); \ - __young |= 
mmu_notifier_clear_flush_young(___vma->vm_mm, \ - ___address, \ - ___address + \ - ___nr * PAGE_SIZE); \ - __young; \ -}) - -#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp) \ -({ \ - int __young; \ - struct vm_area_struct *___vma = __vma; \ - unsigned long ___address = __address; \ - __young = pmdp_clear_flush_young(___vma, ___address, __pmdp); \ - __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ - ___address, \ - ___address + \ - PMD_SIZE); \ - __young; \ -}) - -#define ptep_clear_young_notify(__vma, __address, __ptep) \ -({ \ - int __young; \ - struct vm_area_struct *___vma = __vma; \ - unsigned long ___address = __address; \ - __young = ptep_test_and_clear_young(___vma, ___address, __ptep);\ - __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \ - ___address + PAGE_SIZE); \ - __young; \ -}) - -#define pmdp_clear_young_notify(__vma, __address, __pmdp) \ -({ \ - int __young; \ - struct vm_area_struct *___vma = __vma; \ - unsigned long ___address = __address; \ - __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\ - __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \ - ___address + PMD_SIZE); \ - __young; \ -}) - #else /* CONFIG_MMU_NOTIFIER */ struct mmu_notifier_range { @@ -652,11 +603,6 @@ static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) #define mmu_notifier_range_update_to_read_only(r) false -#define clear_flush_young_ptes_notify clear_flush_young_ptes -#define pmdp_clear_flush_young_notify pmdp_clear_flush_young -#define ptep_clear_young_notify ptep_test_and_clear_young -#define pmdp_clear_young_notify pmdp_test_and_clear_young - static inline void mmu_notifier_synchronize(void) { } diff --git a/mm/internal.h b/mm/internal.h index 6e1162e13289..321b8019de9f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -1796,4 +1797,55 @@ static inline int io_remap_pfn_range_complete(struct 
vm_area_struct *vma, return remap_pfn_range_complete(vma, addr, pfn, size, prot); } +#ifdef CONFIG_MMU_NOTIFIER +static inline int clear_flush_young_ptes_notify(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) +{ + int young; + + young = clear_flush_young_ptes(vma, addr, ptep, nr); + young |= mmu_notifier_clear_flush_young(vma->vm_mm, addr, + addr + nr * PAGE_SIZE); + return young; +} + +static inline int pmdp_clear_flush_young_notify(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + int young; + + young = pmdp_clear_flush_young(vma, addr, pmdp); + young |= mmu_notifier_clear_flush_young(vma->vm_mm, addr, addr + PMD_SIZE); + return young; +} + +static inline int ptep_clear_young_notify(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + int young; + + young = ptep_test_and_clear_young(vma, addr, ptep); + young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE); + return young; +} + +static inline int pmdp_clear_young_notify(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + int young; + + young = pmdp_test_and_clear_young(vma, addr, pmdp); + young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PMD_SIZE); + return young; +} + +#else /* CONFIG_MMU_NOTIFIER */ + +#define clear_flush_young_ptes_notify clear_flush_young_ptes +#define pmdp_clear_flush_young_notify pmdp_clear_flush_young +#define ptep_clear_young_notify ptep_test_and_clear_young +#define pmdp_clear_young_notify pmdp_test_and_clear_young + +#endif /* CONFIG_MMU_NOTIFIER */ + #endif /* __MM_INTERNAL_H */ -- cgit v1.2.3 From 6d7237dda44f24bb0dec5dbd2a0ed6be77bf6ef6 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 6 Mar 2026 14:43:40 +0800 Subject: mm: add a batched helper to clear the young flag for large folios Currently, MGLRU will call ptep_test_and_clear_young_notify() to check and clear the young flag for each PTE sequentially, which is inefficient for large folios reclamation. 
Moreover, on the Arm64 architecture, which supports contiguous PTEs, the Arm64-specific ptep_test_and_clear_young() already implements an optimization to clear the young flags for PTEs within a contiguous range. However, this is not sufficient. Similar to the Arm64-specific clear_flush_young_ptes(), we can extend this to perform batched operations for the entire large folio (which might exceed the contiguous range: CONT_PTE_SIZE). Thus, we can introduce a new batched helper, test_and_clear_young_ptes(), and its wrapper test_and_clear_young_ptes_notify(), which are consistent with the existing functions, to perform batched checking of the young flags for large folios. This can help improve performance during large folio reclamation when MGLRU is enabled. The helper will be overridden in the following patches by architectures that implement a more efficient batched operation. Link: https://lkml.kernel.org/r/23ec671bfcc06cd24ee0fbff8e329402742274a0.1772778858.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Andrew Morton Cc: Alistair Popple Cc: Axel Rasmussen Cc: Barry Song Cc: Catalin Marinas Cc: David Hildenbrand (Arm) Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Qi Zheng Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 37 +++++++++++++++++++++++++++++++++++++ mm/internal.h | 16 +++++++++++----- 2 files changed, 48 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index d2767a4c027b..17d961c612fc 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1103,6 +1103,43 @@ static inline int clear_flush_young_ptes(struct vm_area_struct *vma, } #endif +#ifndef test_and_clear_young_ptes +/** + *
test_and_clear_young_ptes - Mark PTEs that map consecutive pages of the same + * folio as old + * @vma: The virtual memory area the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear access bit. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_test_and_clear_young(). + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + * + * Returns: whether any PTE was young. + */ +static inline int test_and_clear_young_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) +{ + int young = 0; + + for (;;) { + young |= ptep_test_and_clear_young(vma, addr, ptep); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } + + return young; +} +#endif + /* * On some architectures hardware does not set page access bit when accessing * memory page, it is responsibility of software setting this bit. 
It brings diff --git a/mm/internal.h b/mm/internal.h index 1b718fdb074e..1357dc04f065 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1819,13 +1819,13 @@ static inline int pmdp_clear_flush_young_notify(struct vm_area_struct *vma, return young; } -static inline int ptep_test_and_clear_young_notify(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline int test_and_clear_young_ptes_notify(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) { int young; - young = ptep_test_and_clear_young(vma, addr, ptep); - young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE); + young = test_and_clear_young_ptes(vma, addr, ptep, nr); + young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + nr * PAGE_SIZE); return young; } @@ -1843,9 +1843,15 @@ static inline int pmdp_test_and_clear_young_notify(struct vm_area_struct *vma, #define clear_flush_young_ptes_notify clear_flush_young_ptes #define pmdp_clear_flush_young_notify pmdp_clear_flush_young -#define ptep_test_and_clear_young_notify ptep_test_and_clear_young +#define test_and_clear_young_ptes_notify test_and_clear_young_ptes #define pmdp_test_and_clear_young_notify pmdp_test_and_clear_young #endif /* CONFIG_MMU_NOTIFIER */ +static inline int ptep_test_and_clear_young_notify(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + return test_and_clear_young_ptes_notify(vma, addr, ptep, 1); +} + #endif /* __MM_INTERNAL_H */ -- cgit v1.2.3 From 56e5b60b2114dee967c971f08dd29ef193bd3a2d Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 6 Mar 2026 14:43:41 +0800 Subject: mm: support batched checking of the young flag for MGLRU Use the batched helper test_and_clear_young_ptes_notify() to check and clear the young flag to improve the performance during large folio reclamation when MGLRU is enabled. Meanwhile, we can also support batched checking the young and dirty flag when MGLRU walks the mm's pagetable to update the folios' generation counter. 
Since MGLRU also checks the PTE dirty bit, use folio_pte_batch_flags() with FPB_MERGE_YOUNG_DIRTY set to detect batches of PTEs for a large folio. We can then remove ptep_test_and_clear_young_notify(), since it no longer has any users. Note that we also update the 'young' counter and the 'mm_stats[MM_LEAF_YOUNG]' counter with the batched count in lru_gen_look_around() and walk_pte_range(). However, the batched operations may inflate these two counters, because not all PTEs of a large folio may have been accessed. (Additionally, tracking how many PTEs have been accessed within a large folio is not very meaningful, since the mm core actually tracks access/dirty on a per-folio basis, not per page.) The impact analysis is as follows: 1. The 'mm_stats[MM_LEAF_YOUNG]' counter has no functional impact and is mainly for debugging. 2. The 'young' counter is used by suitable_to_scan() to decide whether to place the current PMD entry into the bloom filters (so that next time we can check whether it has been accessed again); inflating it may set the hash bit in the bloom filters for a PMD entry that hasn't seen much access. However, bloom filters inherently allow some error, so this effect appears negligible.
Link: https://lkml.kernel.org/r/378f4acf7d07410aa7c2e4b49d56bb165918eb34.1772778858.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Rik van Riel Acked-by: David Hildenbrand (Arm) Cc: Alistair Popple Cc: Axel Rasmussen Cc: Barry Song Cc: Catalin Marinas Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Qi Zheng Cc: Ryan Roberts Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 5 +++-- mm/internal.h | 6 ------ mm/rmap.c | 28 ++++++++++++++-------------- mm/vmscan.c | 43 ++++++++++++++++++++++++++++++++----------- 4 files changed, 49 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5c3ae0348754..3f651baf7e2b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -684,7 +684,7 @@ struct lru_gen_memcg { void lru_gen_init_pgdat(struct pglist_data *pgdat); void lru_gen_init_lruvec(struct lruvec *lruvec); -bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw); +bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr); void lru_gen_init_memcg(struct mem_cgroup *memcg); void lru_gen_exit_memcg(struct mem_cgroup *memcg); @@ -703,7 +703,8 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } -static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, + unsigned int nr) { return false; } diff --git a/mm/internal.h b/mm/internal.h index 1357dc04f065..4ab833b8bcdf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1848,10 +1848,4 @@ static inline int pmdp_test_and_clear_young_notify(struct vm_area_struct *vma, #endif /* CONFIG_MMU_NOTIFIER */ -static inline int ptep_test_and_clear_young_notify(struct vm_area_struct *vma, - 
unsigned long addr, pte_t *ptep) -{ - return test_and_clear_young_ptes_notify(vma, addr, ptep, 1); -} - #endif /* __MM_INTERNAL_H */ diff --git a/mm/rmap.c b/mm/rmap.c index cd48f34f11b5..abe4712a220c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -965,25 +965,20 @@ static bool folio_referenced_one(struct folio *folio, return false; } + if (pvmw.pte && folio_test_large(folio)) { + const unsigned long end_addr = pmd_addr_end(address, vma->vm_end); + const unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT; + pte_t pteval = ptep_get(pvmw.pte); + + nr = folio_pte_batch(folio, pvmw.pte, pteval, max_nr); + } + if (lru_gen_enabled() && pvmw.pte) { - if (lru_gen_look_around(&pvmw)) + if (lru_gen_look_around(&pvmw, nr)) referenced++; } else if (pvmw.pte) { - if (folio_test_large(folio)) { - unsigned long end_addr = pmd_addr_end(address, vma->vm_end); - unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT; - pte_t pteval = ptep_get(pvmw.pte); - - nr = folio_pte_batch(folio, pvmw.pte, - pteval, max_nr); - } - - ptes += nr; if (clear_flush_young_ptes_notify(vma, address, pvmw.pte, nr)) referenced++; - /* Skip the batched PTEs */ - pvmw.pte += nr - 1; - pvmw.address += (nr - 1) * PAGE_SIZE; } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (pmdp_clear_flush_young_notify(vma, address, pvmw.pmd)) @@ -993,6 +988,7 @@ static bool folio_referenced_one(struct folio *folio, WARN_ON_ONCE(1); } + ptes += nr; pra->mapcount -= nr; /* * If we are sure that we batched the entire folio, @@ -1002,6 +998,10 @@ static bool folio_referenced_one(struct folio *folio, page_vma_mapped_walk_done(&pvmw); break; } + + /* Skip the batched PTEs */ + pvmw.pte += nr - 1; + pvmw.address += (nr - 1) * PAGE_SIZE; } if (referenced) diff --git a/mm/vmscan.c b/mm/vmscan.c index 7ab9e1cdccd2..3a4a0a81c871 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3499,6 +3499,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, struct pglist_data *pgdat = 
lruvec_pgdat(walk->lruvec); DEFINE_MAX_SEQ(walk->lruvec); int gen = lru_gen_from_seq(max_seq); + unsigned int nr; pmd_t pmdval; pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl); @@ -3517,11 +3518,13 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, lazy_mmu_mode_enable(); restart: - for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { + for (i = pte_index(start), addr = start; addr != end; i += nr, addr += nr * PAGE_SIZE) { unsigned long pfn; struct folio *folio; - pte_t ptent = ptep_get(pte + i); + pte_t *cur_pte = pte + i; + pte_t ptent = ptep_get(cur_pte); + nr = 1; total++; walk->mm_stats[MM_LEAF_TOTAL]++; @@ -3533,7 +3536,16 @@ restart: if (!folio) continue; - if (!ptep_test_and_clear_young_notify(args->vma, addr, pte + i)) + if (folio_test_large(folio)) { + const unsigned int max_nr = (end - addr) >> PAGE_SHIFT; + + nr = folio_pte_batch_flags(folio, NULL, cur_pte, &ptent, + max_nr, FPB_MERGE_YOUNG_DIRTY); + total += nr - 1; + walk->mm_stats[MM_LEAF_TOTAL] += nr - 1; + } + + if (!test_and_clear_young_ptes_notify(args->vma, addr, cur_pte, nr)) continue; if (last != folio) { @@ -3546,8 +3558,8 @@ restart: if (pte_dirty(ptent)) dirty = true; - young++; - walk->mm_stats[MM_LEAF_YOUNG]++; + young += nr; + walk->mm_stats[MM_LEAF_YOUNG] += nr; } walk_update_folio(walk, last, gen, dirty); @@ -4191,7 +4203,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) * the PTE table to the Bloom filter. This forms a feedback loop between the * eviction and the aging. 
*/ -bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr) { int i; bool dirty; @@ -4214,7 +4226,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) lockdep_assert_held(pvmw->ptl); VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); - if (!ptep_test_and_clear_young_notify(vma, addr, pte)) + if (!test_and_clear_young_ptes_notify(vma, addr, pte, nr)) return false; if (spin_is_contended(pvmw->ptl)) @@ -4248,10 +4260,12 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) pte -= (addr - start) / PAGE_SIZE; - for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { + for (i = 0, addr = start; addr != end; + i += nr, pte += nr, addr += nr * PAGE_SIZE) { unsigned long pfn; - pte_t ptent = ptep_get(pte + i); + pte_t ptent = ptep_get(pte); + nr = 1; pfn = get_pte_pfn(ptent, vma, addr, pgdat); if (pfn == -1) continue; @@ -4260,7 +4274,14 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) if (!folio) continue; - if (!ptep_test_and_clear_young_notify(vma, addr, pte + i)) + if (folio_test_large(folio)) { + const unsigned int max_nr = (end - addr) >> PAGE_SHIFT; + + nr = folio_pte_batch_flags(folio, NULL, pte, &ptent, + max_nr, FPB_MERGE_YOUNG_DIRTY); + } + + if (!test_and_clear_young_ptes_notify(vma, addr, pte, nr)) continue; if (last != folio) { @@ -4273,7 +4294,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) if (pte_dirty(ptent)) dirty = true; - young++; + young += nr; } walk_update_folio(walk, last, gen, dirty); -- cgit v1.2.3 From 417607de1f4e6280f646aa42cad5ed84e9228c01 Mon Sep 17 00:00:00 2001 From: Yuvraj Sakshith Date: Tue, 3 Mar 2026 03:30:28 -0800 Subject: mm/page_reporting: add PAGE_REPORTING_ORDER_UNSPECIFIED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Allow order zero pages in page reporting", v4. 
Today, page reporting sets page_reporting_order in two ways: (1) page_reporting.page_reporting_order cmdline parameter (2) Driver can pass order while registering itself. In both cases, order zero is ignored by free page reporting because it is used to set page_reporting_order to a default value, like MAX_PAGE_ORDER. In some cases we might want page_reporting_order to be zero. For instance, when virtio-balloon runs inside a guest with tiny memory (say, 16MB), it might not be able to find an order 1 page (or, in the worst case, an order MAX_PAGE_ORDER page) after some uptime. Page reporting should be able to return order zero pages back for optimal memory relinquishment. This patch changes the default fallback value from '0' to '-1' in all possible clients of free page reporting (hv_balloon and virtio-balloon) together with allowing '0' as a valid order in page_reporting_register(). This patch (of 5): A driver can pass the order of pages to be reported while registering itself. Today, this is a magic number, 0. Label this with PAGE_REPORTING_ORDER_UNSPECIFIED and check for it when the driver is being registered. This macro will be used in relevant drivers next. [akpm@linux-foundation.org: tweak whitespace, per David] Link: https://lkml.kernel.org/r/20260303113032.3008371-1-yuvraj.sakshith@oss.qualcomm.com Link: https://lkml.kernel.org/r/20260303113032.3008371-2-yuvraj.sakshith@oss.qualcomm.com Signed-off-by: Yuvraj Sakshith Acked-by: David Hildenbrand (Arm) Reviewed-by: Michael Kelley Acked-by: Michael S. Tsirkin Cc: Brendan Jackman Cc: Dexuan Cui Cc: Eugenio Pérez Cc: Haiyang Zhang Cc: Jason Wang Cc: Johannes Weiner Cc: K. Y.
Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Liu Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/page_reporting.h | 1 + mm/page_reporting.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h index fe648dfa3a7c..d1886c657285 100644 --- a/include/linux/page_reporting.h +++ b/include/linux/page_reporting.h @@ -7,6 +7,7 @@ /* This value should always be a power of 2, see page_reporting_cycle() */ #define PAGE_REPORTING_CAPACITY 32 +#define PAGE_REPORTING_ORDER_UNSPECIFIED 0 struct page_reporting_dev_info { /* function that alters pages to make them "reported" */ diff --git a/mm/page_reporting.c b/mm/page_reporting.c index f0042d5743af..a2da5bf3a065 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -370,7 +370,8 @@ int page_reporting_register(struct page_reporting_dev_info *prdev) */ if (page_reporting_order == -1) { - if (prdev->order > 0 && prdev->order <= MAX_PAGE_ORDER) + if (prdev->order != PAGE_REPORTING_ORDER_UNSPECIFIED && + prdev->order <= MAX_PAGE_ORDER) page_reporting_order = prdev->order; else page_reporting_order = pageblock_order; -- cgit v1.2.3 From 5467c292d07ffcd55a7a66af2259855f49e1dd06 Mon Sep 17 00:00:00 2001 From: Yuvraj Sakshith Date: Tue, 3 Mar 2026 03:30:31 -0800 Subject: mm/page_reporting: change PAGE_REPORTING_ORDER_UNSPECIFIED to -1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PAGE_REPORTING_ORDER_UNSPECIFIED is now set to zero. This means, pages of order zero cannot be reported to a client/driver -- as zero is used to signal a fallback to MAX_PAGE_ORDER. Change PAGE_REPORTING_ORDER_UNSPECIFIED to (-1), so that zero can be used as a valid order with which pages can be reported. 
Link: https://lkml.kernel.org/r/20260303113032.3008371-5-yuvraj.sakshith@oss.qualcomm.com Signed-off-by: Yuvraj Sakshith Acked-by: David Hildenbrand (Arm) Reviewed-by: Michael Kelley Acked-by: Michael S. Tsirkin Cc: Brendan Jackman Cc: Dexuan Cui Cc: Eugenio Pérez Cc: Haiyang Zhang Cc: Jason Wang Cc: Johannes Weiner Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Liu Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/page_reporting.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h index d1886c657285..9d4ca5c218a0 100644 --- a/include/linux/page_reporting.h +++ b/include/linux/page_reporting.h @@ -7,7 +7,7 @@ /* This value should always be a power of 2, see page_reporting_cycle() */ #define PAGE_REPORTING_CAPACITY 32 -#define PAGE_REPORTING_ORDER_UNSPECIFIED 0 +#define PAGE_REPORTING_ORDER_UNSPECIFIED -1 struct page_reporting_dev_info { /* function that alters pages to make them "reported" */ -- cgit v1.2.3 From e650bb30ca532901da6def04c7d1de72ae59ea4e Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Thu, 5 Mar 2026 10:50:14 +0000 Subject: mm: rename VMA flag helpers to be more readable Patch series "mm: vma flag tweaks". The ongoing work around introducing non-system word VMA flags has introduced a number of helper functions and macros to make life easier when working with these flags and to make conversions from the legacy use of VM_xxx flags more straightforward. This series improves these to reduce confusion as to what they do and to improve consistency and readability. Firstly the series renames vma_flags_test() to vma_flags_test_any() to make it abundantly clear that this function tests whether any of the flags are set (as opposed to vma_flags_test_all()). 
It then renames vma_desc_test_flags() to vma_desc_test_any() for the same reason. Note that we drop the 'flags' suffix here, as vma_desc_test_any_flags() would be cumbersome and 'test' implies a flag test. Similarly, we rename vma_test_all_flags() to vma_test_all() for consistency. Next, we have a couple of instances (erofs, zonefs) where we are now testing for vma_desc_test_any(desc, VMA_SHARED_BIT) && vma_desc_test_any(desc, VMA_MAYWRITE_BIT). This is silly, so this series introduces vma_desc_test_all() so these callers can instead invoke vma_desc_test_all(desc, VMA_SHARED_BIT, VMA_MAYWRITE_BIT). We then observe that quite a few instances of vma_flags_test_any() and vma_desc_test_any() are in fact only testing against a single flag. Using the _any() variant here is just confusing - 'any' of a single item reads strangely and is liable to cause confusion. So in these instances the series reintroduces vma_flags_test() and vma_desc_test() as helpers which test against a single flag. The fact that vma_flags_t is a struct and that vma_flag_t utilises sparse to avoid confusion with vm_flags_t makes it impossible for a user to misuse these helpers without it getting flagged somewhere. The series also updates __mk_vma_flags() and functions invoked by it to explicitly mark them always inline to match expectation and to be consistent with other VMA flag helpers. It also renames vma_flag_set() to vma_flags_set_flag() (a function only used by __mk_vma_flags()) to be consistent with other VMA flag helpers. Finally it updates the VMA tests for each of these changes, and introduces explicit tests for vma_flags_test() and vma_desc_test() to assert that they behave as expected. This patch (of 6): On reflection, it's confusing to have vma_flags_test() and vma_desc_test_flags() test whether any comma-separated VMA flag bit is set, while also having vma_flags_test_all() and vma_test_all_flags() separately test whether all flags are set.
Firstly, rename vma_flags_test() to vma_flags_test_any() to eliminate this confusion. Secondly, since the VMA descriptor flag functions are becoming rather cumbersome, prefer vma_desc_test*() to vma_desc_test_flags*(), and also rename vma_desc_test_flags() to vma_desc_test_any(). Finally, rename vma_test_all_flags() to vma_test_all() to keep the VMA-specific helper consistent with the VMA descriptor naming convention and to help avoid confusion vs. vma_flags_test_all(). While we're here, also update whitespace to be consistent in helper functions. Link: https://lkml.kernel.org/r/cover.1772704455.git.ljs@kernel.org Link: https://lkml.kernel.org/r/0f9cb3c511c478344fac0b3b3b0300bb95be95e9.1772704455.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Suggested-by: Pedro Falcato Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Cc: Arnd Bergmann Cc: Babu Moger Cc: Baolin Wang Cc: Chao Yu Cc: Chatre, Reinette Cc: Chunhai Guo Cc: Damien Le Moal Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Hongbo Li Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jeffle Xu Cc: Johannes Thumshirn Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naohiro Aota Cc: Oscar Salvador Cc: Sandeep Dhavale Cc: Suren Baghdasaryan Cc: Vishal Verma Cc: Vlastimil Babka Cc: Yue Hu Signed-off-by: Andrew Morton --- drivers/char/mem.c | 2 +- drivers/dax/device.c | 2 +- fs/erofs/data.c | 4 ++-- fs/hugetlbfs/inode.c | 2 +- fs/ntfs3/file.c | 2 +- fs/resctrl/pseudo_lock.c | 2 +- fs/zonefs/file.c | 4 ++-- include/linux/dax.h | 4 ++-- include/linux/hugetlb_inline.h | 2 +- include/linux/mm.h | 48 +++++++++++++++++++++-------------------- mm/hugetlb.c | 14 ++++++------ mm/memory.c | 2 +- mm/secretmem.c | 2 +- mm/shmem.c | 4 ++-- tools/testing/vma/include/dup.h | 20 ++++++++--------- tools/testing/vma/tests/vma.c | 28
++++++++++++------------ 16 files changed, 72 insertions(+), 70 deletions(-) (limited to 'include') diff --git a/drivers/char/mem.c b/drivers/char/mem.c index cca4529431f8..5118787d0954 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -520,7 +520,7 @@ static int mmap_zero_prepare(struct vm_area_desc *desc) #ifndef CONFIG_MMU return -ENOSYS; #endif - if (vma_desc_test_flags(desc, VMA_SHARED_BIT)) + if (vma_desc_test_any(desc, VMA_SHARED_BIT)) return shmem_zero_setup_desc(desc); desc->action.success_hook = mmap_zero_private_success; diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 528e81240c4d..381021c2e031 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -24,7 +24,7 @@ static int __check_vma(struct dev_dax *dev_dax, vma_flags_t flags, return -ENXIO; /* prevent private mappings from being established */ - if (!vma_flags_test(&flags, VMA_MAYSHARE_BIT)) { + if (!vma_flags_test_any(&flags, VMA_MAYSHARE_BIT)) { dev_info_ratelimited(dev, "%s: %s: fail, attempted private mapping\n", current->comm, func); diff --git a/fs/erofs/data.c b/fs/erofs/data.c index f79ee80627d9..6774d9b5ee82 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -473,8 +473,8 @@ static int erofs_file_mmap_prepare(struct vm_area_desc *desc) if (!IS_DAX(file_inode(desc->file))) return generic_file_readonly_mmap_prepare(desc); - if (vma_desc_test_flags(desc, VMA_SHARED_BIT) && - vma_desc_test_flags(desc, VMA_MAYWRITE_BIT)) + if (vma_desc_test_any(desc, VMA_SHARED_BIT) && + vma_desc_test_any(desc, VMA_MAYWRITE_BIT)) return -EINVAL; desc->vm_ops = &erofs_dax_vm_ops; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 2ec3e4231252..079ffaaf1f6c 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -164,7 +164,7 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) goto out; ret = 0; - if (vma_desc_test_flags(desc, VMA_WRITE_BIT) && inode->i_size < len) + if (vma_desc_test_any(desc, VMA_WRITE_BIT) && inode->i_size < len) 
i_size_write(inode, len); out: inode_unlock(inode); diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 7eecf1e01f74..c5e2181f9f02 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -276,7 +276,7 @@ static int ntfs_file_mmap_prepare(struct vm_area_desc *desc) struct file *file = desc->file; struct inode *inode = file_inode(file); struct ntfs_inode *ni = ntfs_i(inode); - const bool rw = vma_desc_test_flags(desc, VMA_WRITE_BIT); + const bool rw = vma_desc_test_any(desc, VMA_WRITE_BIT); int err; /* Avoid any operation if inode is bad. */ diff --git a/fs/resctrl/pseudo_lock.c b/fs/resctrl/pseudo_lock.c index fa3687d69ebd..79a006c6f26c 100644 --- a/fs/resctrl/pseudo_lock.c +++ b/fs/resctrl/pseudo_lock.c @@ -1044,7 +1044,7 @@ static int pseudo_lock_dev_mmap_prepare(struct vm_area_desc *desc) * Ensure changes are carried directly to the memory being mapped, * do not allow copy-on-write mapping. */ - if (!vma_desc_test_flags(desc, VMA_SHARED_BIT)) { + if (!vma_desc_test_any(desc, VMA_SHARED_BIT)) { mutex_unlock(&rdtgroup_mutex); return -EINVAL; } diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 8a7161fc49e5..9f9273ecf71a 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -333,8 +333,8 @@ static int zonefs_file_mmap_prepare(struct vm_area_desc *desc) * ordering between msync() and page cache writeback. 
*/ if (zonefs_inode_is_seq(file_inode(file)) && - vma_desc_test_flags(desc, VMA_SHARED_BIT) && - vma_desc_test_flags(desc, VMA_MAYWRITE_BIT)) + vma_desc_test_any(desc, VMA_SHARED_BIT) && + vma_desc_test_any(desc, VMA_MAYWRITE_BIT)) return -EINVAL; file_accessed(file); diff --git a/include/linux/dax.h b/include/linux/dax.h index bf103f317cac..535019001577 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -69,7 +69,7 @@ static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc, const struct inode *inode, struct dax_device *dax_dev) { - if (!vma_desc_test_flags(desc, VMA_SYNC_BIT)) + if (!vma_desc_test_any(desc, VMA_SYNC_BIT)) return true; if (!IS_DAX(inode)) return false; @@ -115,7 +115,7 @@ static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc, const struct inode *inode, struct dax_device *dax_dev) { - return !vma_desc_test_flags(desc, VMA_SYNC_BIT); + return !vma_desc_test_any(desc, VMA_SYNC_BIT); } static inline size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index 593f5d4e108b..84afc3c3e2e4 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -13,7 +13,7 @@ static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) static inline bool is_vma_hugetlb_flags(const vma_flags_t *flags) { - return vma_flags_test(flags, VMA_HUGETLB_BIT); + return vma_flags_test_any(flags, VMA_HUGETLB_BIT); } #else diff --git a/include/linux/mm.h b/include/linux/mm.h index c516d5177211..ee7671d6c5eb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1062,7 +1062,7 @@ static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits) (const vma_flag_t []){__VA_ARGS__}) /* Test each of to_test flags in flags, non-atomically. 
*/ -static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags, +static __always_inline bool vma_flags_test_any_mask(const vma_flags_t *flags, vma_flags_t to_test) { const unsigned long *bitmap = flags->__vma_flags; @@ -1074,10 +1074,10 @@ static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags, /* * Test whether any specified VMA flag is set, e.g.: * - * if (vma_flags_test(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... } + * if (vma_flags_test_any(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... } */ -#define vma_flags_test(flags, ...) \ - vma_flags_test_mask(flags, mk_vma_flags(__VA_ARGS__)) +#define vma_flags_test_any(flags, ...) \ + vma_flags_test_any_mask(flags, mk_vma_flags(__VA_ARGS__)) /* Test that ALL of the to_test flags are set, non-atomically. */ static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, @@ -1098,7 +1098,8 @@ static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__)) /* Set each of the to_set flags in flags, non-atomically. */ -static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set) +static __always_inline void vma_flags_set_mask(vma_flags_t *flags, + vma_flags_t to_set) { unsigned long *bitmap = flags->__vma_flags; const unsigned long *bitmap_to_set = to_set.__vma_flags; @@ -1115,7 +1116,8 @@ static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t t vma_flags_set_mask(flags, mk_vma_flags(__VA_ARGS__)) /* Clear all of the to-clear flags in flags, non-atomically. 
*/ -static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t to_clear) +static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, + vma_flags_t to_clear) { unsigned long *bitmap = flags->__vma_flags; const unsigned long *bitmap_to_clear = to_clear.__vma_flags; @@ -1137,8 +1139,8 @@ static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t * Note: appropriate locks must be held, this function does not acquire them for * you. */ -static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma, - vma_flags_t flags) +static inline bool vma_test_all_mask(const struct vm_area_struct *vma, + vma_flags_t flags) { return vma_flags_test_all_mask(&vma->flags, flags); } @@ -1146,10 +1148,10 @@ static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma, /* * Helper macro for checking that ALL specified flags are set in a VMA, e.g.: * - * if (vma_test_all_flags(vma, VMA_READ_BIT, VMA_MAYREAD_BIT) { ... } + * if (vma_test_all(vma, VMA_READ_BIT, VMA_MAYREAD_BIT) { ... } */ -#define vma_test_all_flags(vma, ...) \ - vma_test_all_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) +#define vma_test_all(vma, ...) \ + vma_test_all_mask(vma, mk_vma_flags(__VA_ARGS__)) /* * Helper to set all VMA flags in a VMA. @@ -1158,7 +1160,7 @@ static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma, * you. */ static inline void vma_set_flags_mask(struct vm_area_struct *vma, - vma_flags_t flags) + vma_flags_t flags) { vma_flags_set_mask(&vma->flags, flags); } @@ -1176,25 +1178,25 @@ static inline void vma_set_flags_mask(struct vm_area_struct *vma, vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) /* Helper to test all VMA flags in a VMA descriptor. 
*/ -static inline bool vma_desc_test_flags_mask(const struct vm_area_desc *desc, - vma_flags_t flags) +static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, + vma_flags_t flags) { - return vma_flags_test_mask(&desc->vma_flags, flags); + return vma_flags_test_any_mask(&desc->vma_flags, flags); } /* * Helper macro for testing VMA flags for an input pointer to a struct * vm_area_desc object describing a proposed VMA, e.g.: * - * if (vma_desc_test_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, + * if (vma_desc_test_any(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... } */ -#define vma_desc_test_flags(desc, ...) \ - vma_desc_test_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) +#define vma_desc_test_any(desc, ...) \ + vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) /* Helper to set all VMA flags in a VMA descriptor. */ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, - vma_flags_t flags) + vma_flags_t flags) { vma_flags_set_mask(&desc->vma_flags, flags); } @@ -1211,7 +1213,7 @@ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, /* Helper to clear all VMA flags in a VMA descriptor. 
*/ static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, - vma_flags_t flags) + vma_flags_t flags) { vma_flags_clear_mask(&desc->vma_flags, flags); } @@ -1936,8 +1938,8 @@ static inline bool vma_desc_is_cow_mapping(struct vm_area_desc *desc) { const vma_flags_t *flags = &desc->vma_flags; - return vma_flags_test(flags, VMA_MAYWRITE_BIT) && - !vma_flags_test(flags, VMA_SHARED_BIT); + return vma_flags_test_any(flags, VMA_MAYWRITE_BIT) && + !vma_flags_test_any(flags, VMA_SHARED_BIT); } #ifndef CONFIG_MMU @@ -1956,7 +1958,7 @@ static inline bool is_nommu_shared_mapping(vm_flags_t flags) static inline bool is_nommu_shared_vma_flags(const vma_flags_t *flags) { - return vma_flags_test(flags, VMA_MAYSHARE_BIT, VMA_MAYOVERLAY_BIT); + return vma_flags_test_any(flags, VMA_MAYSHARE_BIT, VMA_MAYOVERLAY_BIT); } #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1d41fa3dd43e..fbbe74f94426 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1194,7 +1194,7 @@ static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map) { VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); - VM_WARN_ON_ONCE(vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)); + VM_WARN_ON_ONCE(vma_desc_test_any(desc, VMA_MAYSHARE_BIT)); desc->private_data = map; } @@ -1202,7 +1202,7 @@ static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *ma static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags) { VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); - VM_WARN_ON_ONCE(vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)); + VM_WARN_ON_ONCE(vma_desc_test_any(desc, VMA_MAYSHARE_BIT)); desc->private_data = (void *)((unsigned long)desc->private_data | flags); } @@ -6593,7 +6593,7 @@ long hugetlb_reserve_pages(struct inode *inode, * attempt will be made for VM_NORESERVE to allocate a page * without using reserves */ - if (vma_flags_test(&vma_flags, 
VMA_NORESERVE_BIT)) + if (vma_flags_test_any(&vma_flags, VMA_NORESERVE_BIT)) return 0; /* @@ -6602,7 +6602,7 @@ long hugetlb_reserve_pages(struct inode *inode, * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. Assume !desc is a shm mapping */ - if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) { + if (!desc || vma_desc_test_any(desc, VMA_MAYSHARE_BIT)) { /* * resv_map can not be NULL as hugetlb_reserve_pages is only * called for inodes for which resv_maps were created (see @@ -6636,7 +6636,7 @@ long hugetlb_reserve_pages(struct inode *inode, if (err < 0) goto out_err; - if (desc && !vma_desc_test_flags(desc, VMA_MAYSHARE_BIT) && h_cg) { + if (desc && !vma_desc_test_any(desc, VMA_MAYSHARE_BIT) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs * of the resv_map. */ @@ -6673,7 +6673,7 @@ long hugetlb_reserve_pages(struct inode *inode, * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ - if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) { + if (!desc || vma_desc_test_any(desc, VMA_MAYSHARE_BIT)) { add = region_add(resv_map, from, to, regions_needed, h, h_cg); if (unlikely(add < 0)) { @@ -6737,7 +6737,7 @@ out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: - if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) + if (!desc || vma_desc_test_any(desc, VMA_MAYSHARE_BIT)) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. 
*/ diff --git a/mm/memory.c b/mm/memory.c index b1c062bf5fc1..f21c804b50bf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2982,7 +2982,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) return -EINVAL; - VM_WARN_ON_ONCE(!vma_test_all_flags_mask(vma, VMA_REMAP_FLAGS)); + VM_WARN_ON_ONCE(!vma_test_all_mask(vma, VMA_REMAP_FLAGS)); BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; diff --git a/mm/secretmem.c b/mm/secretmem.c index 11a779c812a7..5f57ac4720d3 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -122,7 +122,7 @@ static int secretmem_mmap_prepare(struct vm_area_desc *desc) { const unsigned long len = vma_desc_size(desc); - if (!vma_desc_test_flags(desc, VMA_SHARED_BIT, VMA_MAYSHARE_BIT)) + if (!vma_desc_test_any(desc, VMA_SHARED_BIT, VMA_MAYSHARE_BIT)) return -EINVAL; vma_desc_set_flags(desc, VMA_LOCKED_BIT, VMA_DONTDUMP_BIT); diff --git a/mm/shmem.c b/mm/shmem.c index 5e7dcf5bc5d3..965a8908200b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3086,7 +3086,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, spin_lock_init(&info->lock); atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; - info->flags = vma_flags_test(&flags, VMA_NORESERVE_BIT) + info->flags = vma_flags_test_any(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; info->i_crtime = inode_get_mtime(inode); info->fsflags = (dir == NULL) ? 0 : @@ -5827,7 +5827,7 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, unsigned int i_flags) { const unsigned long shmem_flags = - vma_flags_test(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; + vma_flags_test_any(&flags, VMA_NORESERVE_BIT) ? 
SHMEM_F_NORESERVE : 0; struct inode *inode; struct file *res; diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 3078ff1487d3..c46b523e428d 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -843,7 +843,7 @@ static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits); #define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ (const vma_flag_t []){__VA_ARGS__}) -static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags, +static __always_inline bool vma_flags_test_any_mask(const vma_flags_t *flags, vma_flags_t to_test) { const unsigned long *bitmap = flags->__vma_flags; @@ -852,8 +852,8 @@ static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags, return bitmap_intersects(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS); } -#define vma_flags_test(flags, ...) \ - vma_flags_test_mask(flags, mk_vma_flags(__VA_ARGS__)) +#define vma_flags_test_any(flags, ...) \ + vma_flags_test_any_mask(flags, mk_vma_flags(__VA_ARGS__)) static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, vma_flags_t to_test) @@ -889,14 +889,14 @@ static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t #define vma_flags_clear(flags, ...) \ vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__)) -static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma, +static inline bool vma_test_all_mask(const struct vm_area_struct *vma, vma_flags_t flags) { return vma_flags_test_all_mask(&vma->flags, flags); } -#define vma_test_all_flags(vma, ...) \ - vma_test_all_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) +#define vma_test_all(vma, ...) \ + vma_test_all_mask(vma, mk_vma_flags(__VA_ARGS__)) static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags) { @@ -913,14 +913,14 @@ static inline void vma_set_flags_mask(struct vm_area_struct *vma, #define vma_set_flags(vma, ...) 
\ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) -static inline bool vma_desc_test_flags_mask(const struct vm_area_desc *desc, +static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_flags_t flags) { - return vma_flags_test_mask(&desc->vma_flags, flags); + return vma_flags_test_any_mask(&desc->vma_flags, flags); } -#define vma_desc_test_flags(desc, ...) \ - vma_desc_test_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) +#define vma_desc_test_any(desc, ...) \ + vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, vma_flags_t flags) diff --git a/tools/testing/vma/tests/vma.c b/tools/testing/vma/tests/vma.c index c54ffc954f11..f031e6dfb474 100644 --- a/tools/testing/vma/tests/vma.c +++ b/tools/testing/vma/tests/vma.c @@ -159,8 +159,8 @@ static bool test_vma_flags_word(void) return true; } -/* Ensure that vma_flags_test() and friends works correctly. */ -static bool test_vma_flags_test(void) +/* Ensure that vma_flags_test_any() and friends works correctly. */ +static bool test_vma_flags_test_any(void) { const vma_flags_t flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64, 65); @@ -171,16 +171,16 @@ static bool test_vma_flags_test(void) desc.vma_flags = flags; #define do_test(...) \ - ASSERT_TRUE(vma_flags_test(&flags, __VA_ARGS__)); \ - ASSERT_TRUE(vma_desc_test_flags(&desc, __VA_ARGS__)) + ASSERT_TRUE(vma_flags_test_any(&flags, __VA_ARGS__)); \ + ASSERT_TRUE(vma_desc_test_any(&desc, __VA_ARGS__)) #define do_test_all_true(...) \ ASSERT_TRUE(vma_flags_test_all(&flags, __VA_ARGS__)); \ - ASSERT_TRUE(vma_test_all_flags(&vma, __VA_ARGS__)) + ASSERT_TRUE(vma_test_all(&vma, __VA_ARGS__)) #define do_test_all_false(...) 
\ ASSERT_FALSE(vma_flags_test_all(&flags, __VA_ARGS__)); \ - ASSERT_FALSE(vma_test_all_flags(&vma, __VA_ARGS__)) + ASSERT_FALSE(vma_test_all(&vma, __VA_ARGS__)) /* * Testing for some flags that are present, some that are not - should @@ -200,7 +200,7 @@ static bool test_vma_flags_test(void) * Check _mask variant. We don't need to test extensively as macro * helper is the equivalent. */ - ASSERT_TRUE(vma_flags_test_mask(&flags, flags)); + ASSERT_TRUE(vma_flags_test_any_mask(&flags, flags)); ASSERT_TRUE(vma_flags_test_all_mask(&flags, flags)); /* Single bits. */ @@ -268,9 +268,9 @@ static bool test_vma_flags_clear(void) vma_flags_clear_mask(&flags, mask); vma_flags_clear_mask(&vma.flags, mask); vma_desc_clear_flags_mask(&desc, mask); - ASSERT_FALSE(vma_flags_test(&flags, VMA_EXEC_BIT, 64)); - ASSERT_FALSE(vma_flags_test(&vma.flags, VMA_EXEC_BIT, 64)); - ASSERT_FALSE(vma_desc_test_flags(&desc, VMA_EXEC_BIT, 64)); + ASSERT_FALSE(vma_flags_test_any(&flags, VMA_EXEC_BIT, 64)); + ASSERT_FALSE(vma_flags_test_any(&vma.flags, VMA_EXEC_BIT, 64)); + ASSERT_FALSE(vma_desc_test_any(&desc, VMA_EXEC_BIT, 64)); /* Reset. 
*/ vma_flags_set(&flags, VMA_EXEC_BIT, 64); vma_set_flags(&vma, VMA_EXEC_BIT, 64); @@ -284,9 +284,9 @@ static bool test_vma_flags_clear(void) vma_flags_clear(&flags, __VA_ARGS__); \ vma_flags_clear(&vma.flags, __VA_ARGS__); \ vma_desc_clear_flags(&desc, __VA_ARGS__); \ - ASSERT_FALSE(vma_flags_test(&flags, __VA_ARGS__)); \ - ASSERT_FALSE(vma_flags_test(&vma.flags, __VA_ARGS__)); \ - ASSERT_FALSE(vma_desc_test_flags(&desc, __VA_ARGS__)); \ + ASSERT_FALSE(vma_flags_test_any(&flags, __VA_ARGS__)); \ + ASSERT_FALSE(vma_flags_test_any(&vma.flags, __VA_ARGS__)); \ + ASSERT_FALSE(vma_desc_test_any(&desc, __VA_ARGS__)); \ vma_flags_set(&flags, __VA_ARGS__); \ vma_set_flags(&vma, __VA_ARGS__); \ vma_desc_set_flags(&desc, __VA_ARGS__) @@ -334,6 +334,6 @@ static void run_vma_tests(int *num_tests, int *num_fail) TEST(vma_flags_unchanged); TEST(vma_flags_cleared); TEST(vma_flags_word); - TEST(vma_flags_test); + TEST(vma_flags_test_any); TEST(vma_flags_clear); } -- cgit v1.2.3 From 0b3ed2a495b5c10296d9371502d70ce4398f0c58 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Thu, 5 Mar 2026 10:50:15 +0000 Subject: mm: add vma_desc_test_all() and use it erofs and zonefs are using vma_desc_test_any() twice to check whether all of VMA_SHARED_BIT and VMA_MAYWRITE_BIT are set, this is silly, so add vma_desc_test_all() to test all flags and update erofs and zonefs to use it. While we're here, update the helper function comments to be more consistent. Also add the same to the VMA test headers. 
Link: https://lkml.kernel.org/r/568c8f8d6a84ff64014f997517cba7a629f7eed6.1772704455.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Vlastimil Babka (SUSE) Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Cc: Arnd Bergmann Cc: Babu Moger Cc: Baolin Wang Cc: Chao Yu Cc: Chatre, Reinette Cc: Chunhai Guo Cc: Damien Le Maol Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Hongbo Li Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jeffle Xu Cc: Johannes Thumshirn Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naohiro Aota Cc: Oscar Salvador Cc: Sandeep Dhavale Cc: Suren Baghdasaryan Cc: Vishal Verma Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/erofs/data.c | 3 +-- fs/zonefs/file.c | 3 +-- include/linux/mm.h | 24 ++++++++++++++++++++---- tools/testing/vma/include/dup.h | 9 +++++++++ 4 files changed, 31 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 6774d9b5ee82..b33dd4d8710e 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -473,8 +473,7 @@ static int erofs_file_mmap_prepare(struct vm_area_desc *desc) if (!IS_DAX(file_inode(desc->file))) return generic_file_readonly_mmap_prepare(desc); - if (vma_desc_test_any(desc, VMA_SHARED_BIT) && - vma_desc_test_any(desc, VMA_MAYWRITE_BIT)) + if (vma_desc_test_all(desc, VMA_SHARED_BIT, VMA_MAYWRITE_BIT)) return -EINVAL; desc->vm_ops = &erofs_dax_vm_ops; diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 9f9273ecf71a..5ada33f70bb4 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -333,8 +333,7 @@ static int zonefs_file_mmap_prepare(struct vm_area_desc *desc) * ordering between msync() and page cache writeback. 
*/ if (zonefs_inode_is_seq(file_inode(file)) && - vma_desc_test_any(desc, VMA_SHARED_BIT) && - vma_desc_test_any(desc, VMA_MAYWRITE_BIT)) + vma_desc_test_all(desc, VMA_SHARED_BIT, VMA_MAYWRITE_BIT)) return -EINVAL; file_accessed(file); diff --git a/include/linux/mm.h b/include/linux/mm.h index ee7671d6c5eb..f964e4050583 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1177,7 +1177,7 @@ static inline void vma_set_flags_mask(struct vm_area_struct *vma, #define vma_set_flags(vma, ...) \ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) -/* Helper to test all VMA flags in a VMA descriptor. */ +/* Helper to test any VMA flags in a VMA descriptor. */ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_flags_t flags) { @@ -1185,8 +1185,8 @@ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, } /* - * Helper macro for testing VMA flags for an input pointer to a struct - * vm_area_desc object describing a proposed VMA, e.g.: + * Helper macro for testing whether any VMA flags are set in a VMA descriptor, + * e.g.: * * if (vma_desc_test_any(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... } @@ -1194,6 +1194,22 @@ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, #define vma_desc_test_any(desc, ...) \ vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) +/* Helper to test all VMA flags in a VMA descriptor. */ +static inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, + vma_flags_t flags) +{ + return vma_flags_test_all_mask(&desc->vma_flags, flags); +} + +/* + * Helper macro for testing whether ALL VMA flags are set in a VMA descriptor, + * e.g.: + * + * if (vma_desc_test_all(desc, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... } + */ +#define vma_desc_test_all(desc, ...) \ + vma_desc_test_all_mask(desc, mk_vma_flags(__VA_ARGS__)) + /* Helper to set all VMA flags in a VMA descriptor. 
*/ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, vma_flags_t flags) @@ -1206,7 +1222,7 @@ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, * vm_area_desc object describing a proposed VMA, e.g.: * * vma_desc_set_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, - * VMA_DONTDUMP_BIT); + * VMA_DONTDUMP_BIT); */ #define vma_desc_set_flags(desc, ...) \ vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index c46b523e428d..59788bc14d75 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -922,6 +922,15 @@ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, #define vma_desc_test_any(desc, ...) \ vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) +static inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, + vma_flags_t flags) +{ + return vma_flags_test_all_mask(&desc->vma_flags, flags); +} + +#define vma_desc_test_all(desc, ...) \ + vma_desc_test_all_mask(desc, mk_vma_flags(__VA_ARGS__)) + static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, vma_flags_t flags) { -- cgit v1.2.3 From a5eee1128de526ba199bd4c7be39b849223e5001 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Thu, 5 Mar 2026 10:50:16 +0000 Subject: mm: always inline __mk_vma_flags() and invoked functions Be explicit about __mk_vma_flags() (which is used by the mk_vma_flags() macro) always being inline, as we rely on the compiler to evaluate the loop in this function and determine that it can replace the code with an equivalent constant value, e.g. that: __mk_vma_flags(2, (const vma_flag_t []){ VMA_WRITE_BIT, VMA_EXEC_BIT }); Can be replaced with: (1UL << VMA_WRITE_BIT) | (1UL << VMA_EXEC_BIT) = (1UL << 1) | (1UL << 2) = 6 Most likely an 'inline' will suffice for this, but be as explicit as we can be.
Also update all of the functions __mk_vma_flags() ultimately invokes to be always inline too. Note that test_bitmap_const_eval() asserts that the relevant bitmap functions result in build time constant values. Additionally, vma_flag_set() operates on a vma_flags_t type, so it is inconsistently named versus other VMA flags functions. We only use vma_flag_set() in __mk_vma_flags() so we don't need to worry about its new name being rather cumbersome, so rename it to vma_flags_set_flag() to disambiguate it from vma_flags_set(). Also update the VMA test headers to reflect the changes. Link: https://lkml.kernel.org/r/241f49c52074d436edbb9c6a6662a8dc142a8f43.1772704455.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Cc: Arnd Bergmann Cc: Babu Moger Cc: Baolin Wang Cc: Chao Yu Cc: Chatre, Reinette Cc: Chunhai Guo Cc: Damien Le Maol Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Hongbo Li Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jeffle Xu Cc: Johannes Thumshirn Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naohiro Aota Cc: Oscar Salvador Cc: Sandeep Dhavale Cc: Suren Baghdasaryan Cc: Vishal Verma Cc: Vlastimil Babka Cc: Yue Hu Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 +++++--- include/linux/mm_types.h | 2 +- tools/testing/vma/include/custom.h | 5 +++-- tools/testing/vma/include/dup.h | 5 +++-- 4 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index f964e4050583..9dcdf13570fb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1030,21 +1030,23 @@ static inline bool vma_test_atomic_flag(struct vm_area_struct *vma, vma_flag_t b } /* Set an individual VMA flag in flags, non-atomically. 
*/ -static inline void vma_flag_set(vma_flags_t *flags, vma_flag_t bit) +static __always_inline void vma_flags_set_flag(vma_flags_t *flags, + vma_flag_t bit) { unsigned long *bitmap = flags->__vma_flags; __set_bit((__force int)bit, bitmap); } -static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits) +static __always_inline vma_flags_t __mk_vma_flags(size_t count, + const vma_flag_t *bits) { vma_flags_t flags; int i; vma_flags_clear_all(&flags); for (i = 0; i < count; i++) - vma_flag_set(&flags, bits[i]); + vma_flags_set_flag(&flags, bits[i]); return flags; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7bc82a2b889f..f22aecb047b7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1056,7 +1056,7 @@ struct vm_area_struct { } __randomize_layout; /* Clears all bits in the VMA flags bitmap, non-atomically. */ -static inline void vma_flags_clear_all(vma_flags_t *flags) +static __always_inline void vma_flags_clear_all(vma_flags_t *flags) { bitmap_zero(flags->__vma_flags, NUM_VMA_FLAG_BITS); } diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index 802a76317245..833ff4d7f799 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -102,7 +102,8 @@ static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) refcount_set(&vma->vm_refcnt, 0); } -static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits) +static __always_inline vma_flags_t __mk_vma_flags(size_t count, + const vma_flag_t *bits) { vma_flags_t flags; int i; @@ -114,6 +115,6 @@ static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits) vma_flags_clear_all(&flags); for (i = 0; i < count; i++) if (bits[i] < NUM_VMA_FLAG_BITS) - vma_flag_set(&flags, bits[i]); + vma_flags_set_flag(&flags, bits[i]); return flags; } diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 
59788bc14d75..ef6b9d963acc 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -780,12 +780,13 @@ static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) *bitmap &= ~value; } -static inline void vma_flags_clear_all(vma_flags_t *flags) +static __always_inline void vma_flags_clear_all(vma_flags_t *flags) { bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS); } -static inline void vma_flag_set(vma_flags_t *flags, vma_flag_t bit) +static __always_inline void vma_flags_set_flag(vma_flags_t *flags, + vma_flag_t bit) { unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); -- cgit v1.2.3 From 5e6d45d720ca299cc82d84948c4ba622fff64f22 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Thu, 5 Mar 2026 10:50:17 +0000 Subject: mm: reintroduce vma_flags_test() as a singular flag test Since we've now renamed vma_flags_test() to vma_flags_test_any() to be very clear as to what we are in fact testing, we now have the opportunity to bring vma_flags_test() back, but for explicitly testing a single VMA flag. This is useful, as often flag tests are against a single flag, and vma_flags_test_any(flags, VMA_READ_BIT) reads oddly and potentially causes confusion. We use sparse to enforce that users won't accidentally pass vm_flags_t to this function without it being flagged so this should make it harder to get this wrong. Of course, passing vma_flags_t to the function is impossible, as it is a struct. Also update the VMA tests to reflect this change. 
Link: https://lkml.kernel.org/r/f33f8d7f16c3f3d286a1dc2cba12c23683073134.1772704455.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Cc: Arnd Bergmann Cc: Babu Moger Cc: Baolin Wang Cc: Chao Yu Cc: Chatre, Reinette Cc: Chunhai Guo Cc: Damien Le Maol Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Hongbo Li Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jeffle Xu Cc: Johannes Thumshirn Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naohiro Aota Cc: Oscar Salvador Cc: Sandeep Dhavale Cc: Suren Baghdasaryan Cc: Vishal Verma Cc: Vlastimil Babka Cc: Yue Hu Signed-off-by: Andrew Morton --- include/linux/mm.h | 17 +++++++++++++++-- mm/hugetlb.c | 2 +- mm/shmem.c | 4 ++-- tools/testing/vma/include/dup.h | 8 ++++++++ 4 files changed, 26 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9dcdf13570fb..9392723a5c50 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1050,6 +1050,19 @@ static __always_inline vma_flags_t __mk_vma_flags(size_t count, return flags; } +/* + * Test whether a specific VMA flag is set, e.g.: + * + * if (vma_flags_test(flags, VMA_READ_BIT)) { ... } + */ +static __always_inline bool vma_flags_test(const vma_flags_t *flags, + vma_flag_t bit) +{ + const unsigned long *bitmap = flags->__vma_flags; + + return test_bit((__force int)bit, bitmap); +} + /* * Helper macro which bitwise-or combines the specified input flags into a * vma_flags_t bitmap value. 
E.g.: @@ -1956,8 +1969,8 @@ static inline bool vma_desc_is_cow_mapping(struct vm_area_desc *desc) { const vma_flags_t *flags = &desc->vma_flags; - return vma_flags_test_any(flags, VMA_MAYWRITE_BIT) && - !vma_flags_test_any(flags, VMA_SHARED_BIT); + return vma_flags_test(flags, VMA_MAYWRITE_BIT) && + !vma_flags_test(flags, VMA_SHARED_BIT); } #ifndef CONFIG_MMU diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fbbe74f94426..9363b6072c0a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6593,7 +6593,7 @@ long hugetlb_reserve_pages(struct inode *inode, * attempt will be made for VM_NORESERVE to allocate a page * without using reserves */ - if (vma_flags_test_any(&vma_flags, VMA_NORESERVE_BIT)) + if (vma_flags_test(&vma_flags, VMA_NORESERVE_BIT)) return 0; /* diff --git a/mm/shmem.c b/mm/shmem.c index 965a8908200b..5e7dcf5bc5d3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3086,7 +3086,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, spin_lock_init(&info->lock); atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; - info->flags = vma_flags_test_any(&flags, VMA_NORESERVE_BIT) + info->flags = vma_flags_test(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; info->i_crtime = inode_get_mtime(inode); info->fsflags = (dir == NULL) ? 0 : @@ -5827,7 +5827,7 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, unsigned int i_flags) { const unsigned long shmem_flags = - vma_flags_test_any(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; + vma_flags_test(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; struct inode *inode; struct file *res; diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index ef6b9d963acc..630478f0d583 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -844,6 +844,14 @@ static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits); #define mk_vma_flags(...) 
__mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ (const vma_flag_t []){__VA_ARGS__}) +static __always_inline bool vma_flags_test(const vma_flags_t *flags, + vma_flag_t bit) +{ + const unsigned long *bitmap = flags->__vma_flags; + + return test_bit((__force int)bit, bitmap); +} + static __always_inline bool vma_flags_test_any_mask(const vma_flags_t *flags, vma_flags_t to_test) { -- cgit v1.2.3 From 0c2aa6635716a5aa19576deef062efab5322072f Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Thu, 5 Mar 2026 10:50:18 +0000 Subject: mm: reintroduce vma_desc_test() as a singular flag test Similar to vma_flags_test(), we have previously renamed vma_desc_test() to vma_desc_test_any(). Now that is in place, we can reintroduce vma_desc_test() to explicitly check for a single VMA flag. As with vma_flags_test(), this is useful as often flag tests are against a single flag, and vma_desc_test_any(flags, VMA_READ_BIT) reads oddly and potentially causes confusion. As with vma_flags_test() a combination of sparse and vma_flags_t being a struct means that users cannot misuse this function without it getting flagged. Also update the VMA tests to reflect this change. 
Link: https://lkml.kernel.org/r/3a65ca23defb05060333f0586428fe279a484564.1772704455.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Cc: Arnd Bergmann Cc: Babu Moger Cc: Baolin Wang Cc: Chao Yu Cc: Chatre, Reinette Cc: Chunhai Guo Cc: Damien Le Maol Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Hongbo Li Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jeffle Xu Cc: Johannes Thumshirn Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naohiro Aota Cc: Oscar Salvador Cc: Sandeep Dhavale Cc: Suren Baghdasaryan Cc: Vishal Verma Cc: Vlastimil Babka Cc: Yue Hu Signed-off-by: Andrew Morton --- drivers/char/mem.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/ntfs3/file.c | 2 +- fs/resctrl/pseudo_lock.c | 2 +- include/linux/dax.h | 4 ++-- include/linux/mm.h | 11 +++++++++++ mm/hugetlb.c | 12 ++++++------ tools/testing/vma/include/dup.h | 6 ++++++ 8 files changed, 29 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 5118787d0954..5fd421e48c04 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -520,7 +520,7 @@ static int mmap_zero_prepare(struct vm_area_desc *desc) #ifndef CONFIG_MMU return -ENOSYS; #endif - if (vma_desc_test_any(desc, VMA_SHARED_BIT)) + if (vma_desc_test(desc, VMA_SHARED_BIT)) return shmem_zero_setup_desc(desc); desc->action.success_hook = mmap_zero_private_success; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 079ffaaf1f6c..cd6b22f6e2b1 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -164,7 +164,7 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) goto out; ret = 0; - if (vma_desc_test_any(desc, VMA_WRITE_BIT) && inode->i_size < len) + if (vma_desc_test(desc, VMA_WRITE_BIT) && inode->i_size < len) 
i_size_write(inode, len); out: inode_unlock(inode); diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index c5e2181f9f02..fbdfaf989a31 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -276,7 +276,7 @@ static int ntfs_file_mmap_prepare(struct vm_area_desc *desc) struct file *file = desc->file; struct inode *inode = file_inode(file); struct ntfs_inode *ni = ntfs_i(inode); - const bool rw = vma_desc_test_any(desc, VMA_WRITE_BIT); + const bool rw = vma_desc_test(desc, VMA_WRITE_BIT); int err; /* Avoid any operation if inode is bad. */ diff --git a/fs/resctrl/pseudo_lock.c b/fs/resctrl/pseudo_lock.c index 79a006c6f26c..d1cb0986006e 100644 --- a/fs/resctrl/pseudo_lock.c +++ b/fs/resctrl/pseudo_lock.c @@ -1044,7 +1044,7 @@ static int pseudo_lock_dev_mmap_prepare(struct vm_area_desc *desc) * Ensure changes are carried directly to the memory being mapped, * do not allow copy-on-write mapping. */ - if (!vma_desc_test_any(desc, VMA_SHARED_BIT)) { + if (!vma_desc_test(desc, VMA_SHARED_BIT)) { mutex_unlock(&rdtgroup_mutex); return -EINVAL; } diff --git a/include/linux/dax.h b/include/linux/dax.h index 535019001577..10a7cc79aea5 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -69,7 +69,7 @@ static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc, const struct inode *inode, struct dax_device *dax_dev) { - if (!vma_desc_test_any(desc, VMA_SYNC_BIT)) + if (!vma_desc_test(desc, VMA_SYNC_BIT)) return true; if (!IS_DAX(inode)) return false; @@ -115,7 +115,7 @@ static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc, const struct inode *inode, struct dax_device *dax_dev) { - return !vma_desc_test_any(desc, VMA_SYNC_BIT); + return !vma_desc_test(desc, VMA_SYNC_BIT); } static inline size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9392723a5c50..63d1f619260e 100644 --- a/include/linux/mm.h +++ 
b/include/linux/mm.h @@ -1192,6 +1192,17 @@ static inline void vma_set_flags_mask(struct vm_area_struct *vma, #define vma_set_flags(vma, ...) \ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) +/* + * Test whether a specific VMA flag is set in a VMA descriptor, e.g.: + * + * if (vma_desc_test(desc, VMA_READ_BIT)) { ... } + */ +static __always_inline bool vma_desc_test(const struct vm_area_desc *desc, + vma_flag_t bit) +{ + return vma_flags_test(&desc->vma_flags, bit); +} + /* Helper to test any VMA flags in a VMA descriptor. */ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_flags_t flags) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9363b6072c0a..992c1632d26a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1194,7 +1194,7 @@ static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map) { VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); - VM_WARN_ON_ONCE(vma_desc_test_any(desc, VMA_MAYSHARE_BIT)); + VM_WARN_ON_ONCE(vma_desc_test(desc, VMA_MAYSHARE_BIT)); desc->private_data = map; } @@ -1202,7 +1202,7 @@ static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *ma static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags) { VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); - VM_WARN_ON_ONCE(vma_desc_test_any(desc, VMA_MAYSHARE_BIT)); + VM_WARN_ON_ONCE(vma_desc_test(desc, VMA_MAYSHARE_BIT)); desc->private_data = (void *)((unsigned long)desc->private_data | flags); } @@ -6602,7 +6602,7 @@ long hugetlb_reserve_pages(struct inode *inode, * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. 
Assume !desc is a shm mapping */ - if (!desc || vma_desc_test_any(desc, VMA_MAYSHARE_BIT)) { + if (!desc || vma_desc_test(desc, VMA_MAYSHARE_BIT)) { /* * resv_map can not be NULL as hugetlb_reserve_pages is only * called for inodes for which resv_maps were created (see @@ -6636,7 +6636,7 @@ long hugetlb_reserve_pages(struct inode *inode, if (err < 0) goto out_err; - if (desc && !vma_desc_test_any(desc, VMA_MAYSHARE_BIT) && h_cg) { + if (desc && !vma_desc_test(desc, VMA_MAYSHARE_BIT) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs * of the resv_map. */ @@ -6673,7 +6673,7 @@ long hugetlb_reserve_pages(struct inode *inode, * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ - if (!desc || vma_desc_test_any(desc, VMA_MAYSHARE_BIT)) { + if (!desc || vma_desc_test(desc, VMA_MAYSHARE_BIT)) { add = region_add(resv_map, from, to, regions_needed, h, h_cg); if (unlikely(add < 0)) { @@ -6737,7 +6737,7 @@ out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: - if (!desc || vma_desc_test_any(desc, VMA_MAYSHARE_BIT)) + if (!desc || vma_desc_test(desc, VMA_MAYSHARE_BIT)) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. */ diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 630478f0d583..5eb313beb43d 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -922,6 +922,12 @@ static inline void vma_set_flags_mask(struct vm_area_struct *vma, #define vma_set_flags(vma, ...) 
\ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) +static __always_inline bool vma_desc_test(const struct vm_area_desc *desc, + vma_flag_t bit) +{ + return vma_flags_test(&desc->vma_flags, bit); +} + static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_flags_t flags) { -- cgit v1.2.3 From db359fccf212e7fa3136e6edbed6228475646fd7 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Tue, 24 Feb 2026 14:13:47 +0900 Subject: mm: introduce a new page type for page pool in page type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the condition 'page->pp_magic == PP_SIGNATURE' is used to determine if a page belongs to a page pool. However, with the planned removal of @pp_magic, we should instead leverage the page_type in struct page, such as PGTY_netpp, for this purpose. Introduce and use the page type APIs e.g. PageNetpp(), __SetPageNetpp(), and __ClearPageNetpp() instead, and remove the existing APIs accessing @pp_magic e.g. page_pool_page_is_pp(), netmem_or_pp_magic(), and netmem_clear_pp_magic(). Plus, add @page_type to struct net_iov at the same offset as struct page so as to use the page_type APIs for struct net_iov as well. While at it, reorder @type and @owner in struct net_iov to avoid a hole and increasing the struct size. This work was inspired by the following link: https://lore.kernel.org/all/582f41c0-2742-4400-9c81-0d46bf4e8314@gmail.com/ While at it, move the sanity check for page pool to the free path.
[byungchul@sk.com: gate the sanity check, per Johannes] Link: https://lkml.kernel.org/r/20260316223113.20097-1-byungchul@sk.com Link: https://lkml.kernel.org/r/20260224051347.19621-1-byungchul@sk.com Co-developed-by: Pavel Begunkov Signed-off-by: Pavel Begunkov Signed-off-by: Byungchul Park Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Acked-by: Zi Yan Acked-by: Vlastimil Babka Reviewed-by: Toke Høiland-Jørgensen Acked-by: Mike Rapoport (Microsoft) Acked-by: Johannes Weiner Acked-by: Jakub Kicinski Acked-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Cc: Alexei Starovoitov Cc: Andrew Lunn Cc: Baolin Wang Cc: Brendan Jackman Cc: Christian Brauner Cc: Daniel Borkmann Cc: David S. Miller Cc: David Wei Cc: Dragos Tatulea Cc: Eric Dumazet Cc: John Fastabend Cc: Leon Romanovsky Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Mark Bloch Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mina Almasry Cc: Paolo Abeni Cc: Saeed Mahameed Cc: Simon Horman Cc: Stanislav Fomichev Cc: Stehen Rothwell Cc: Suren Baghdasaryan Cc: Taehee Yoo Cc: Tariq Toukan Cc: Usama Arif Cc: Yu Zhao Signed-off-by: Andrew Morton --- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 2 +- include/linux/mm.h | 27 +++--------------------- include/linux/page-flags.h | 6 ++++++ include/net/netmem.h | 15 +++++++++++-- mm/page_alloc.c | 13 ++++++++---- net/core/netmem_priv.h | 23 +++++++++----------- net/core/page_pool.c | 24 +++++++++++++++++++-- 7 files changed, 64 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 80f9fc10877a..7d90d2485c78 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -707,7 +707,7 @@ static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq, xdpi = mlx5e_xdpi_fifo_pop(xdpi_fifo); page = xdpi.page.page; - /* No need to check page_pool_page_is_pp() as we + /* No need to 
check PageNetpp() as we * know this is a page_pool page. */ page_pool_recycle_direct(pp_page_to_nmdesc(page)->pp, diff --git a/include/linux/mm.h b/include/linux/mm.h index 63d1f619260e..c758f4e68727 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4840,10 +4840,9 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); * DMA mapping IDs for page_pool * * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and - * stashes it in the upper bits of page->pp_magic. We always want to be able to - * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP - * pages can have arbitrary kernel pointers stored in the same field as pp_magic - * (since it overlaps with page->lru.next), so we must ensure that we cannot + * stashes it in the upper bits of page->pp_magic. Non-PP pages can have + * arbitrary kernel pointers stored in the same field as pp_magic (since + * it overlaps with page->lru.next), so we must ensure that we cannot * mistake a valid kernel pointer with any of the values we write into this * field. * @@ -4878,26 +4877,6 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); #define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ PP_DMA_INDEX_SHIFT) -/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is - * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for - * the head page of compound page and bit 1 for pfmemalloc page, as well as the - * bits used for the DMA index. page_is_pfmemalloc() is checked in - * __page_pool_put_page() to avoid recycling the pfmemalloc page. 
- */ -#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL) - -#ifdef CONFIG_PAGE_POOL -static inline bool page_pool_page_is_pp(const struct page *page) -{ - return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE; -} -#else -static inline bool page_pool_page_is_pp(const struct page *page) -{ - return false; -} -#endif - #define PAGE_SNAPSHOT_FAITHFUL (1 << 0) #define PAGE_SNAPSHOT_PG_BUDDY (1 << 1) #define PAGE_SNAPSHOT_PG_IDLE (1 << 2) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 7223f6f4e2b4..0e03d816e8b9 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -923,6 +923,7 @@ enum pagetype { PGTY_zsmalloc = 0xf6, PGTY_unaccepted = 0xf7, PGTY_large_kmalloc = 0xf8, + PGTY_netpp = 0xf9, PGTY_mapcount_underflow = 0xff }; @@ -1055,6 +1056,11 @@ PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) PAGE_TYPE_OPS(LargeKmalloc, large_kmalloc, large_kmalloc) +/* + * Marks page_pool allocated pages. + */ +PAGE_TYPE_OPS(Netpp, netpp, netpp) + /** * PageHuge - Determine if the page belongs to hugetlbfs * @page: The page to test. diff --git a/include/net/netmem.h b/include/net/netmem.h index a96b3e5e5574..85e3b26ec547 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -110,10 +110,21 @@ struct net_iov { atomic_long_t pp_ref_count; }; }; - struct net_iov_area *owner; + + unsigned int page_type; enum net_iov_type type; + struct net_iov_area *owner; }; +/* Make sure 'the offset of page_type in struct page == the offset of + * page_type in struct net_iov'. + */ +#define NET_IOV_ASSERT_OFFSET(pg, iov) \ + static_assert(offsetof(struct page, pg) == \ + offsetof(struct net_iov, iov)) +NET_IOV_ASSERT_OFFSET(page_type, page_type); +#undef NET_IOV_ASSERT_OFFSET + struct net_iov_area { /* Array of net_iovs for this area.
*/ struct net_iov *niovs; @@ -256,7 +267,7 @@ static inline unsigned long netmem_pfn_trace(netmem_ref netmem) */ #define pp_page_to_nmdesc(p) \ ({ \ - DEBUG_NET_WARN_ON_ONCE(!page_pool_page_is_pp(p)); \ + DEBUG_NET_WARN_ON_ONCE(!PageNetpp(p)); \ __pp_page_to_nmdesc(p); \ }) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f11f38ba2e12..fdcc2fde565b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1043,7 +1043,6 @@ static inline bool page_expected_state(struct page *page, #ifdef CONFIG_MEMCG page->memcg_data | #endif - page_pool_page_is_pp(page) | (page->flags.f & check_flags))) return false; @@ -1070,8 +1069,6 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) if (unlikely(page->memcg_data)) bad_reason = "page still charged to cgroup"; #endif - if (unlikely(page_pool_page_is_pp(page))) - bad_reason = "page_pool leak"; return bad_reason; } @@ -1380,9 +1377,17 @@ __always_inline bool __free_pages_prepare(struct page *page, mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); folio->mapping = NULL; } - if (unlikely(page_has_type(page))) + if (unlikely(page_has_type(page))) { + /* networking expects to clear its page type before releasing */ + if (is_check_pages_enabled()) { + if (unlikely(PageNetpp(page))) { + bad_page(page, "page_pool leak"); + return false; + } + } /* Reset the page_type (which overlays _mapcount) */ page->page_type = UINT_MAX; + } if (is_check_pages_enabled()) { if (free_page_is_bad(page)) diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h index 23175cb2bd86..3e6fde8f1726 100644 --- a/net/core/netmem_priv.h +++ b/net/core/netmem_priv.h @@ -8,21 +8,18 @@ static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) return netmem_to_nmdesc(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; } -static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) -{ - netmem_to_nmdesc(netmem)->pp_magic |= pp_magic; -} - -static inline void netmem_clear_pp_magic(netmem_ref netmem) -{ - 
WARN_ON_ONCE(netmem_to_nmdesc(netmem)->pp_magic & PP_DMA_INDEX_MASK); - - netmem_to_nmdesc(netmem)->pp_magic = 0; -} - static inline bool netmem_is_pp(netmem_ref netmem) { - return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE; + struct page *page; + + /* XXX: Now that the offset of page_type is shared between + * struct page and net_iov, just cast the netmem to struct page + * unconditionally by clearing NET_IOV if any, no matter whether + * it comes from struct net_iov or struct page. This should be + * adjusted once the offset is no longer shared. + */ + page = (struct page *)((__force unsigned long)netmem & ~NET_IOV); + return PageNetpp(page); } static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 265a729431bb..877bbf7a1938 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -702,8 +702,18 @@ s32 page_pool_inflight(const struct page_pool *pool, bool strict) void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) { + struct page *page; + netmem_set_pp(netmem, pool); - netmem_or_pp_magic(netmem, PP_SIGNATURE); + + /* XXX: Now that the offset of page_type is shared between + * struct page and net_iov, just cast the netmem to struct page + * unconditionally by clearing NET_IOV if any, no matter whether + * it comes from struct net_iov or struct page. This should be + * adjusted once the offset is no longer shared. 
+ */ + page = (struct page *)((__force unsigned long)netmem & ~NET_IOV); + __SetPageNetpp(page); /* Ensuring all pages have been split into one fragment initially: * page_pool_set_pp_info() is only called once for every page when it @@ -718,7 +728,17 @@ void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) void page_pool_clear_pp_info(netmem_ref netmem) { - netmem_clear_pp_magic(netmem); + struct page *page; + + /* XXX: Now that the offset of page_type is shared between + * struct page and net_iov, just cast the netmem to struct page + * unconditionally by clearing NET_IOV if any, no matter whether + * it comes from struct net_iov or struct page. This should be + * adjusted once the offset is no longer shared. + */ + page = (struct page *)((__force unsigned long)netmem & ~NET_IOV); + __ClearPageNetpp(page); + netmem_set_pp(netmem, NULL); } -- cgit v1.2.3 From 3802e1d98e92ca6abdd25446b802f405fef83da0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 7 Mar 2026 11:53:52 -0800 Subject: mm/damon: document non-zero length damon_region assumption DAMON regions are assumed to always be non-zero length. There was a confusion [1] about it, probably due to lack of the documentation. Document it. Link: https://lkml.kernel.org/r/20260307195356.203753-5-sj@kernel.org Link: https://lore.kernel.org/20251231070029.79682-1-sj@kernel.org/ [1] Signed-off-by: SeongJae Park Acked-by: wang lian Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 60e6da3012fa..7d0265d02954 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -55,6 +55,8 @@ struct damon_size_range { * @list: List head for siblings. 
* @age: Age of this region. * + * For any use case, @ar should be non-zero positive size. + * * @nr_accesses is reset to zero for every &damon_attrs->aggr_interval and be * increased for every &damon_attrs->sample_interval if an access to the region * during the last sampling interval is found. The update of this field should -- cgit v1.2.3 From 341ffe82a7a3a1e0756b58999405b6df0c2b3e8d Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 9 Mar 2026 16:18:58 +0100 Subject: mm: move vma_kernel_pagesize() from hugetlb to mm.h Patch series "mm: move vma_(kernel|mmu)_pagesize() out of hugetlb.c", v2. Looking into vma_(kernel|mmu)_pagesize(), I realized that there is one scenario where DAX would not do the right thing when the kernel is not compiled with hugetlb support. Without hugetlb support, vma_(kernel|mmu)_pagesize() will always return PAGE_SIZE instead of using the ->pagesize() result provided by dax-device code. Fix that by moving vma_kernel_pagesize() to core MM code, where it belongs. I don't think this is stable material, but am not 100% sure. Also, move vma_mmu_pagesize() while at it. Remove the unnecessary hugetlb.h inclusion from KVM code. This patch (of 4): In the past, only hugetlb had special "vma_kernel_pagesize()" requirements, so it provided its own implementation. In commit 05ea88608d4e ("mm, hugetlbfs: introduce ->pagesize() to vm_operations_struct") we generalized that approach by providing a vm_ops->pagesize() callback to be used by device-dax. Once device-dax started using that callback in commit c1d53b92b95c ("device-dax: implement ->pagesize() for smaps to report MMUPageSize") it was missed that CONFIG_DEV_DAX does not depend on hugetlb support. So building a kernel with CONFIG_DEV_DAX but without CONFIG_HUGETLBFS would not pick up that value. Fix it by moving vma_kernel_pagesize() to mm.h, providing only a single implementation. While at it, improve the kerneldoc a bit. 
Ideally, we'd move vma_mmu_pagesize() as well to the header. However, its __weak symbol might be overwritten by a PPC variant in hugetlb code. So let's leave it in there for now, as it really only matters for some hugetlb oddities. This was found by code inspection. Link: https://lkml.kernel.org/r/20260309151901.123947-1-david@kernel.org Link: https://lkml.kernel.org/r/20260309151901.123947-2-david@kernel.org Fixes: c1d53b92b95c ("device-dax: implement ->pagesize() for smaps to report MMUPageSize") Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Mike Rapoport (Microsoft) Cc: Dan Williams Cc: "Christophe Leroy (CS GROUP)" Cc: Jann Horn Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Pedro Falcato Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 7 ------- include/linux/mm.h | 20 ++++++++++++++++++++ mm/hugetlb.c | 17 ----------------- 3 files changed, 20 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 65910437be1c..44c1848a2c21 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -777,8 +777,6 @@ static inline unsigned long huge_page_size(const struct hstate *h) return (unsigned long)PAGE_SIZE << h->order; } -extern unsigned long vma_kernel_pagesize(struct vm_area_struct *vma); - extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); static inline unsigned long huge_page_mask(struct hstate *h) @@ -1177,11 +1175,6 @@ static inline unsigned long huge_page_mask(struct hstate *h) return PAGE_MASK; } -static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) -{ - return PAGE_SIZE; -} - static inline unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { return PAGE_SIZE; diff --git a/include/linux/mm.h b/include/linux/mm.h index c758f4e68727..e62cea754b0e 100644 --- 
a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1351,6 +1351,26 @@ static inline bool vma_is_shared_maywrite(const struct vm_area_struct *vma) return is_shared_maywrite(&vma->flags); } +/** + * vma_kernel_pagesize - Default page size granularity for this VMA. + * @vma: The user mapping. + * + * The kernel page size specifies in which granularity VMA modifications + * can be performed. Folios in this VMA will be aligned to, and at least + * the size of the number of bytes returned by this function. + * + * The default kernel page size is not affected by Transparent Huge Pages + * being in effect. + * + * Return: The default page size granularity for this VMA. + */ +static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) +{ + if (unlikely(vma->vm_ops && vma->vm_ops->pagesize)) + return vma->vm_ops->pagesize(vma); + return PAGE_SIZE; +} + static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 992c1632d26a..66761ae5ce71 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1017,23 +1017,6 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, (vma->vm_pgoff >> huge_page_order(h)); } -/** - * vma_kernel_pagesize - Page size granularity for this VMA. - * @vma: The user mapping. - * - * Folios in this VMA will be aligned to, and at least the size of the - * number of bytes returned by this function. - * - * Return: The default size of the folios allocated when backing a VMA. - */ -unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) -{ - if (vma->vm_ops && vma->vm_ops->pagesize) - return vma->vm_ops->pagesize(vma); - return PAGE_SIZE; -} -EXPORT_SYMBOL_GPL(vma_kernel_pagesize); - /* * Return the page size being used by the MMU to back a VMA. In the majority * of cases, the page size used by the kernel matches the MMU size. 
On -- cgit v1.2.3 From a9496e9e4b7c5785e82000a26b1118b4a1fd85c7 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 9 Mar 2026 16:18:59 +0100 Subject: mm: move vma_mmu_pagesize() from hugetlb to vma.c vma_mmu_pagesize() is also queried on non-hugetlb VMAs and does not really belong into hugetlb.c. PPC64 provides a custom overwrite with CONFIG_HUGETLB_PAGE, see arch/powerpc/mm/book3s64/slice.c, so we cannot easily make this a static inline function. So let's move it to vma.c and add some proper kerneldoc. To make vma tests happy, add a simple vma_kernel_pagesize() stub in tools/testing/vma/include/custom.h. Link: https://lkml.kernel.org/r/20260309151901.123947-3-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Mike Rapoport (Microsoft) Cc: "Christophe Leroy (CS GROUP)" Cc: Dan Williams Cc: Jann Horn Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Pedro Falcato Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 7 ------- include/linux/mm.h | 2 ++ mm/hugetlb.c | 11 ----------- mm/vma.c | 21 +++++++++++++++++++++ tools/testing/vma/include/custom.h | 5 +++++ 5 files changed, 28 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 44c1848a2c21..aaf3d472e6b5 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -777,8 +777,6 @@ static inline unsigned long huge_page_size(const struct hstate *h) return (unsigned long)PAGE_SIZE << h->order; } -extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); - static inline unsigned long huge_page_mask(struct hstate *h) { return h->mask; @@ -1175,11 +1173,6 @@ static inline unsigned long huge_page_mask(struct hstate *h) return PAGE_MASK; } -static inline unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) -{ - return 
PAGE_SIZE; -} - static inline unsigned int huge_page_order(struct hstate *h) { return 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index e62cea754b0e..efb8be5d259c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1371,6 +1371,8 @@ static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) return PAGE_SIZE; } +unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); + static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 66761ae5ce71..a786034ac95c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1017,17 +1017,6 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, (vma->vm_pgoff >> huge_page_order(h)); } -/* - * Return the page size being used by the MMU to back a VMA. In the majority - * of cases, the page size used by the kernel matches the MMU size. On - * architectures where it differs, an architecture-specific 'strong' - * version of this symbol is required. - */ -__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) -{ - return vma_kernel_pagesize(vma); -} - /* * Flags for MAP_PRIVATE reservations. These are stored in the bottom * bits of the reservation map pointer, which are always clear due to diff --git a/mm/vma.c b/mm/vma.c index be64f781a3aa..e95fd5a5fe5c 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -3300,3 +3300,24 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) return 0; } + +/** + * vma_mmu_pagesize - Default MMU page size granularity for this VMA. + * @vma: The user mapping. + * + * In the common case, the default page size used by the MMU matches the + * default page size used by the kernel (see vma_kernel_pagesize()). On + * architectures where it differs, an architecture-specific 'strong' version + * of this symbol is required. 
+ * + * The default MMU page size is not affected by Transparent Huge Pages + * being in effect, or any usage of larger MMU page sizes (either through + * architectural huge-page mappings or other explicit/implicit coalescing of + * virtual ranges performed by the MMU). + * + * Return: The default MMU page size granularity for this VMA. + */ +__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) +{ + return vma_kernel_pagesize(vma); +} diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index 833ff4d7f799..7150e09122b2 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -118,3 +118,8 @@ static __always_inline vma_flags_t __mk_vma_flags(size_t count, vma_flags_set_flag(&flags, bits[i]); return flags; } + +static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) +{ + return PAGE_SIZE; +} -- cgit v1.2.3 From d239462787b072c78eb19fc1f155c3d411256282 Mon Sep 17 00:00:00 2001 From: Anthony Yznaga Date: Tue, 10 Mar 2026 08:58:20 -0700 Subject: mm: prevent droppable mappings from being locked Droppable mappings must not be lockable. There is a check for VMAs with VM_DROPPABLE set in mlock_fixup() along with checks for other types of unlockable VMAs which ensures this when calling mlock()/mlock2(). For mlockall(MCL_FUTURE), the check for unlockable VMAs is different. In apply_mlockall_flags(), if the flags parameter has MCL_FUTURE set, the current task's mm's default VMA flag field mm->def_flags has VM_LOCKED applied to it. VM_LOCKONFAULT is also applied if MCL_ONFAULT is also set. When these flags are set as default in this manner they are cleared in __mmap_complete() for new mappings that do not support mlock. A check for VM_DROPPABLE in __mmap_complete() is missing, resulting in droppable mappings created with VM_LOCKED set. To fix this and reduce the chance of similar bugs in the future, introduce and use vma_supports_mlock(). 
Link: https://lkml.kernel.org/r/20260310155821.17869-1-anthony.yznaga@oracle.com Fixes: 9651fcedf7b9 ("mm: add MAP_DROPPABLE for designating always lazily freeable mappings") Signed-off-by: Anthony Yznaga Suggested-by: David Hildenbrand Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Reviewed-by: Lorenzo Stoakes (Oracle) Tested-by: Lorenzo Stoakes (Oracle) Cc: Jann Horn Cc: Jason A. Donenfeld Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb_inline.h | 2 +- mm/internal.h | 10 ++++++++++ mm/mlock.c | 10 ++++++---- mm/vma.c | 4 +--- tools/testing/vma/include/stubs.h | 5 +++++ 5 files changed, 23 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index 84afc3c3e2e4..565b473fd135 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -30,7 +30,7 @@ static inline bool is_vma_hugetlb_flags(const vma_flags_t *flags) #endif -static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +static inline bool is_vm_hugetlb_page(const struct vm_area_struct *vma) { return is_vm_hugetlb_flags(vma->vm_flags); } diff --git a/mm/internal.h b/mm/internal.h index 4ab833b8bcdf..ebb68ad10d5c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1243,6 +1243,16 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, } return fpin; } + +static inline bool vma_supports_mlock(const struct vm_area_struct *vma) +{ + if (vma->vm_flags & (VM_SPECIAL | VM_DROPPABLE)) + return false; + if (vma_is_dax(vma) || is_vm_hugetlb_page(vma)) + return false; + return vma != get_gate_vma(current->mm); +} + #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } static inline void mlock_new_folio(struct folio *folio) { } diff --git a/mm/mlock.c b/mm/mlock.c index 1a92d16f3684..fd648138bc72 100644 --- a/mm/mlock.c +++ 
b/mm/mlock.c @@ -472,10 +472,12 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, int ret = 0; vm_flags_t oldflags = vma->vm_flags; - if (newflags == oldflags || (oldflags & VM_SPECIAL) || - is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || - vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE)) - /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ + if (newflags == oldflags || vma_is_secretmem(vma) || + !vma_supports_mlock(vma)) + /* + * Don't set VM_LOCKED or VM_LOCKONFAULT and don't count. + * For secretmem, don't allow the memory to be unlocked. + */ goto out; vma = vma_modify_flags(vmi, *prev, vma, start, end, &newflags); diff --git a/mm/vma.c b/mm/vma.c index e95fd5a5fe5c..b7055c264b5d 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2589,9 +2589,7 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) vm_stat_account(mm, vma->vm_flags, map->pglen); if (vm_flags & VM_LOCKED) { - if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || - is_vm_hugetlb_page(vma) || - vma == get_gate_vma(mm)) + if (!vma_supports_mlock(vma)) vm_flags_clear(vma, VM_LOCKED_MASK); else mm->locked_vm += map->pglen; diff --git a/tools/testing/vma/include/stubs.h b/tools/testing/vma/include/stubs.h index 947a3a0c2566..416bb93f5005 100644 --- a/tools/testing/vma/include/stubs.h +++ b/tools/testing/vma/include/stubs.h @@ -426,3 +426,8 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, } static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {} + +static inline bool vma_supports_mlock(const struct vm_area_struct *vma) +{ + return false; +} -- cgit v1.2.3 From 8719c59c4b928fc9ad8d8f45ecbdf859660c904c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Mar 2026 18:05:17 -0700 Subject: mm/damon/core: introduce damos_quota_goal_tuner Patch series "mm/damon: support multiple goal-based quota tuning algorithms". 
Aim-oriented DAMOS quota auto-tuning uses a single tuning algorithm. The algorithm is designed to find a quota value that should be consistently kept for achieving the aimed goal for the long term. It is useful and reliable at automatically operating systems that have dynamic environments in the long term. As always, however, no single algorithm fits all. When the environment has static characteristics or there are control towers in not only the kernel space but also the user space, the algorithm shows some limitations. In such environments, users want the kernel to work in a more short-term, deterministic way. Actually there were at least two reports [1,2] of such cases. Extend DAMOS quotas goal to support multiple quota tuning algorithms that users can select. Keep the current algorithm as the default one, to not break the old users. Also give it a name, "consist", as it is designed to "consistently" apply the DAMOS action. And introduce a new tuning algorithm, namely "temporal". It is designed to apply the DAMOS action only temporally, in a deterministic way. In more detail, as long as the goal is under-achieved, it uses the maximum quota available. Once the goal is over-achieved, it sets the quota to zero. Tests ===== I confirmed the feature is working as expected using the latest version of the DAMON user-space tool, like below. $ # start DAMOS for reclaiming memory aiming 30% free memory $ sudo ./damo/damo start --damos_action pageout \ --damos_quota_goal_tuner temporal \ --damos_quota_goal node_mem_free_bp 30% 0 \ --damos_quota_interval 1s \ --damos_quota_space 100M Note that >=3.1.8 version of the DAMON user-space tool supports this feature (--damos_quota_goal_tuner). As expected, DAMOS stops reclaiming memory as soon as the goal amount of free memory is made. When the 'consist' tuner is used, the reclamation was continued even after the goal amount of free memory is made, resulting in more than the goal amount of free memory, as expected. 
Patch Sequence ============== First four patches implement the features. Patch 1 extends core API to allow multiple tuners and make the current tuner as the default and only available tuner, namely 'consist'. Patch 2 allows future tuners setting zero effective quota. Patch 3 introduces the second tuner, namely 'temporal'. Patch 4 further extends DAMON sysfs API to let users use that. Three following patches (patches 5-7) update design, usage, and ABI documents, respectively. Final four patches (patches 8-11) are for adding tests. The eighth patch (patch 8) extends the kunit test for online parameters commit for validating the goal_tuner. The ninth and the tenth patches (patches 9-10) extend the testing-purpose DAMON sysfs control helper and DAMON status dumping tool to support the newly added feature. The final eleventh one (patch 11) extends the existing online commit selftest to cover the new feature. This patch (of 11): DAMOS quota goal feature utilizes a single feedback loop based algorithm for automatic tuning of the effective quota. It is useful in dynamic environments that operate systems with only kernels in the long term. But, no one fits all. It is not very easy to control in environments having more controlled characteristics and user-space control towers. We actually got multiple reports [1,2] of use cases that the algorithm is not optimal. Introduce a new field of 'struct damos_quotas', namely 'goal_tuner'. It specifies what tuning algorithm the given scheme should use, and allows DAMON API callers to set it as they want. Nonetheless, this commit introduces no new tuning algorithm but only the interface. This commit hence makes no behavioral change. A new algorithm will be added by the following commit. 
Link: https://lkml.kernel.org/r/20260310010529.91162-2-sj@kernel.org Link: https://lore.kernel.org/CALa+Y17__d=ZsM1yX+MXx0ozVdsXnFqF4p0g+kATEitrWyZFfg@mail.gmail.com [1] Link: https://lore.kernel.org/20260204022537.814-1-yunjeong.mun@sk.com [2] Signed-off-by: SeongJae Park Cc: Shuah Khan Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/damon.h | 10 ++++++++++ mm/damon/core.c | 1 + 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 7d0265d02954..24de35a8395a 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -215,12 +215,21 @@ struct damos_quota_goal { struct list_head list; }; +/** + * enum damos_quota_goal_tuner - Goal-based quota tuning logic. + * @DAMOS_QUOTA_GOAL_TUNER_CONSIST: Aim long term consistent quota. + */ +enum damos_quota_goal_tuner { + DAMOS_QUOTA_GOAL_TUNER_CONSIST, +}; + /** * struct damos_quota - Controls the aggressiveness of the given scheme. * @reset_interval: Charge reset interval in milliseconds. * @ms: Maximum milliseconds that the scheme can use. * @sz: Maximum bytes of memory that the action can be applied. * @goals: Head of quota tuning goals (&damos_quota_goal) list. + * @goal_tuner: Goal-based @esz tuning algorithm to use. * @esz: Effective size quota in bytes. * * @weight_sz: Weight of the region's size for prioritization. 
@@ -262,6 +271,7 @@ struct damos_quota { unsigned long ms; unsigned long sz; struct list_head goals; + enum damos_quota_goal_tuner goal_tuner; unsigned long esz; unsigned int weight_sz; diff --git a/mm/damon/core.c b/mm/damon/core.c index 2d2332f3d377..16905bf35c40 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -912,6 +912,7 @@ static int damos_commit_quota(struct damos_quota *dst, struct damos_quota *src) err = damos_commit_quota_goals(dst, src); if (err) return err; + dst->goal_tuner = src->goal_tuner; dst->weight_sz = src->weight_sz; dst->weight_nr_accesses = src->weight_nr_accesses; dst->weight_age = src->weight_age; -- cgit v1.2.3 From af738a6a00c1febb0d543ba6a1400413f824ecf1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Mar 2026 18:05:19 -0700 Subject: mm/damon/core: introduce DAMOS_QUOTA_GOAL_TUNER_TEMPORAL Introduce a new goal-based DAMOS quota auto-tuning algorithm, namely DAMOS_QUOTA_GOAL_TUNER_TEMPORAL (temporal in short). The algorithm aims to trigger the DAMOS action only for a temporal time, to achieve the goal as soon as possible. For the temporal period, it uses as much quota as allowed. Once the goal is achieved, it sets the quota zero, so effectively makes the scheme be deactivated. Link: https://lkml.kernel.org/r/20260310010529.91162-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ mm/damon/core.c | 29 ++++++++++++++++++++++++----- 2 files changed, 26 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 24de35a8395a..e44e2132ccaf 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -218,9 +218,11 @@ struct damos_quota_goal { /** * enum damos_quota_goal_tuner - Goal-based quota tuning logic. * @DAMOS_QUOTA_GOAL_TUNER_CONSIST: Aim long term consistent quota. + * @DAMOS_QUOTA_GOAL_TUNER_TEMPORAL: Aim zero quota asap. 
*/ enum damos_quota_goal_tuner { DAMOS_QUOTA_GOAL_TUNER_CONSIST, + DAMOS_QUOTA_GOAL_TUNER_TEMPORAL, }; /** diff --git a/mm/damon/core.c b/mm/damon/core.c index db3c59b70e49..b543d1202c9d 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2347,6 +2347,26 @@ static unsigned long damos_quota_score(struct damos_quota *quota) return highest_score; } +static void damos_goal_tune_esz_bp_consist(struct damos_quota *quota) +{ + unsigned long score = damos_quota_score(quota); + + quota->esz_bp = damon_feed_loop_next_input( + max(quota->esz_bp, 10000UL), score); +} + +static void damos_goal_tune_esz_bp_temporal(struct damos_quota *quota) +{ + unsigned long score = damos_quota_score(quota); + + if (score >= 10000) + quota->esz_bp = 0; + else if (quota->sz) + quota->esz_bp = quota->sz * 10000; + else + quota->esz_bp = ULONG_MAX; +} + /* * Called only if quota->ms, or quota->sz are set, or quota->goals is not empty */ @@ -2361,11 +2381,10 @@ static void damos_set_effective_quota(struct damos_quota *quota) } if (!list_empty(&quota->goals)) { - unsigned long score = damos_quota_score(quota); - - quota->esz_bp = damon_feed_loop_next_input( - max(quota->esz_bp, 10000UL), - score); + if (quota->goal_tuner == DAMOS_QUOTA_GOAL_TUNER_CONSIST) + damos_goal_tune_esz_bp_consist(quota); + else if (quota->goal_tuner == DAMOS_QUOTA_GOAL_TUNER_TEMPORAL) + damos_goal_tune_esz_bp_temporal(quota); esz = quota->esz_bp / 10000; } -- cgit v1.2.3 From d4e981b280454f4368950db6269c6077d66453cf Mon Sep 17 00:00:00 2001 From: Kexin Sun Date: Thu, 12 Mar 2026 13:38:12 +0800 Subject: kasan: update outdated comment kmalloc_large() was renamed kmalloc_large_noprof() by commit 7bd230a26648 ("mm/slab: enable slab allocation tagging for kmalloc and friends"), and subsequently renamed __kmalloc_large_noprof() by commit a0a44d9175b3 ("mm, slab: don't wrap internal functions with alloc_hooks()"), making it an internal implementation detail. 
Large kmalloc allocations are now performed through the public kmalloc() interface directly, making the reference to KMALLOC_MAX_SIZE also stale (KMALLOC_MAX_CACHE_SIZE would be more accurate). Remove the references to kmalloc_large() and KMALLOC_MAX_SIZE, and rephrase the description for large kmalloc allocations. Link: https://lkml.kernel.org/r/20260312053812.1365-1-kexinsun@smail.nju.edu.cn Signed-off-by: Kexin Sun Suggested-by: Andrey Ryabinin Assisted-by: unnamed:deepseek-v3.2 coccinelle Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Julia Lawall Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/kasan.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 338a1921a50a..bf233bde68c7 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -352,8 +352,8 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip); * kasan_mempool_unpoison_object(). * * This function operates on all slab allocations including large kmalloc - * allocations (the ones returned by kmalloc_large() or by kmalloc() with the - * size > KMALLOC_MAX_SIZE). + * allocations (i.e. the ones backed directly by the buddy allocator rather + * than kmalloc slab caches). * * Return: true if the allocation can be safely reused; false otherwise. */ @@ -381,8 +381,8 @@ void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip); * original tags based on the pointer value. * * This function operates on all slab allocations including large kmalloc - * allocations (the ones returned by kmalloc_large() or by kmalloc() with the - * size > KMALLOC_MAX_SIZE). + * allocations (i.e. the ones backed directly by the buddy allocator rather + * than kmalloc slab caches). 
*/ static __always_inline void kasan_mempool_unpoison_object(void *ptr, size_t size) -- cgit v1.2.3 From 2d1e54aab6fd01f7502af20e125312e06a15bf9c Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Wed, 11 Mar 2026 17:24:37 +0000 Subject: mm: abstract reading sysctl_max_map_count, and READ_ONCE() Concurrent reads and writes of sysctl_max_map_count are possible, so we should READ_ONCE() and WRITE_ONCE(). The sysctl procfs logic already enforces WRITE_ONCE(), so abstract the read side with get_sysctl_max_map_count(). While we're here, also move the field to mm/internal.h and add the getter there since only mm interacts with it, there's no need for anybody else to have access. Finally, update the VMA userland tests to reflect the change. Link: https://lkml.kernel.org/r/0715259eb37cbdfde4f9e5db92a20ec7110a1ce5.1773249037.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Pedro Falcato Cc: Jann Horn Cc: Jianzhou Zhao Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- mm/internal.h | 6 ++++++ mm/mmap.c | 2 +- mm/mremap.c | 4 ++-- mm/nommu.c | 2 +- mm/vma.c | 6 +++--- tools/testing/vma/include/custom.h | 3 --- tools/testing/vma/include/dup.h | 9 +++++++++ tools/testing/vma/main.c | 2 ++ 9 files changed, 24 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index efb8be5d259c..25ba5816e02b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -207,8 +207,6 @@ static inline void __mm_zero_struct_page(struct page *page) #define MAPCOUNT_ELF_CORE_MARGIN (5) #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) -extern int sysctl_max_map_count; - extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; diff --git a/mm/internal.h b/mm/internal.h index f50a0376b87e..62d80fd37ae1 100644 --- a/mm/internal.h +++ 
b/mm/internal.h @@ -1863,4 +1863,10 @@ static inline int pmdp_test_and_clear_young_notify(struct vm_area_struct *vma, #endif /* CONFIG_MMU_NOTIFIER */ +extern int sysctl_max_map_count; +static inline int get_sysctl_max_map_count(void) +{ + return READ_ONCE(sysctl_max_map_count); +} + #endif /* __MM_INTERNAL_H */ diff --git a/mm/mmap.c b/mm/mmap.c index 843160946aa5..79544d893411 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -375,7 +375,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, return -EOVERFLOW; /* Too many mappings? */ - if (mm->map_count > sysctl_max_map_count) + if (mm->map_count > get_sysctl_max_map_count()) return -ENOMEM; /* diff --git a/mm/mremap.c b/mm/mremap.c index e8c3021dd841..ba6c690f6c1b 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1045,7 +1045,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) * which may not merge, then (if MREMAP_DONTUNMAP is not set) unmap the * source, which may split, causing a net increase of 2 mappings. */ - if (current->mm->map_count + 2 > sysctl_max_map_count) + if (current->mm->map_count + 2 > get_sysctl_max_map_count()) return -ENOMEM; if (vma->vm_ops && vma->vm_ops->may_split) { @@ -1813,7 +1813,7 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm) * net increased map count of 2. In move_vma() we check for headroom of * 2 additional mappings, so check early to avoid bailing out then. 
*/ - if (current->mm->map_count + 4 > sysctl_max_map_count) + if (current->mm->map_count + 4 > get_sysctl_max_map_count()) return -ENOMEM; return 0; diff --git a/mm/nommu.c b/mm/nommu.c index c3a23b082adb..ed3934bc2de4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1317,7 +1317,7 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return -ENOMEM; mm = vma->vm_mm; - if (mm->map_count >= sysctl_max_map_count) + if (mm->map_count >= get_sysctl_max_map_count()) return -ENOMEM; region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL); diff --git a/mm/vma.c b/mm/vma.c index b7055c264b5d..4d21e7d8e93c 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -590,7 +590,7 @@ out_free_vma: static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { - if (vma->vm_mm->map_count >= sysctl_max_map_count) + if (vma->vm_mm->map_count >= get_sysctl_max_map_count()) return -ENOMEM; return __split_vma(vmi, vma, addr, new_below); @@ -1394,7 +1394,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, * its limit temporarily, to help free resources as expected. 
*/ if (vms->end < vms->vma->vm_end && - vms->vma->vm_mm->map_count >= sysctl_max_map_count) { + vms->vma->vm_mm->map_count >= get_sysctl_max_map_count()) { error = -ENOMEM; goto map_count_exceeded; } @@ -2868,7 +2868,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) return -ENOMEM; - if (mm->map_count > sysctl_max_map_count) + if (mm->map_count > get_sysctl_max_map_count()) return -ENOMEM; if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index 7150e09122b2..6c62a38a2f6f 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -21,9 +21,6 @@ extern unsigned long dac_mmap_min_addr; #define VM_BUG_ON(_expr) (BUG_ON(_expr)) #define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr)) -/* We hardcode this for now. */ -#define sysctl_max_map_count 0x1000000UL - #define TASK_SIZE ((1ul << 47)-PAGE_SIZE) /* diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 5eb313beb43d..8865ffe046d8 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -419,6 +419,9 @@ struct vma_iterator { #define EMPTY_VMA_FLAGS ((vma_flags_t){ }) +#define MAPCOUNT_ELF_CORE_MARGIN (5) +#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) + /* What action should be taken after an .mmap_prepare call is complete? */ enum mmap_action_type { MMAP_NOTHING, /* Mapping is complete, no further action. 
*/ @@ -1342,3 +1345,9 @@ static inline void vma_set_file(struct vm_area_struct *vma, struct file *file) swap(vma->vm_file, file); fput(file); } + +extern int sysctl_max_map_count; +static inline int get_sysctl_max_map_count(void) +{ + return READ_ONCE(sysctl_max_map_count); +} diff --git a/tools/testing/vma/main.c b/tools/testing/vma/main.c index 49b09e97a51f..18338f5d29e0 100644 --- a/tools/testing/vma/main.c +++ b/tools/testing/vma/main.c @@ -14,6 +14,8 @@ #include "tests/mmap.c" #include "tests/vma.c" +int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; + /* Helper functions which utilise static kernel functions. */ struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg) -- cgit v1.2.3 From eabc2eddb2767e0ed90f98a65744bf4c8e287db7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Mar 2026 22:29:24 -0700 Subject: mm/damon/core: receive addr_unit on damon_set_region_biggest_system_ram_default() damon_find_biggest_system_ram() was not supporting addr_unit in the past. Hence, its caller, damon_set_region_biggest_system_ram_default(), was also not supporting addr_unit. The previous commit has updated the inner function to support addr_unit. There is no more reason to not support addr_unit on damon_set_region_biggest_system_ram_default(). Rather, it makes unnecessary inconsistency on support of addr_unit. Update it to receive addr_unit and handle it inside. 
Link: https://lkml.kernel.org/r/20260311052927.93921-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Yang yingliang Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + mm/damon/core.c | 7 ++++--- mm/damon/lru_sort.c | 1 + mm/damon/reclaim.c | 1 + 4 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index e44e2132ccaf..d9a3babbafc1 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -994,6 +994,7 @@ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control); int damon_set_region_biggest_system_ram_default(struct damon_target *t, unsigned long *start, unsigned long *end, + unsigned long addr_unit, unsigned long min_region_sz); #endif /* CONFIG_DAMON */ diff --git a/mm/damon/core.c b/mm/damon/core.c index f5f46ba5d537..01c892a1dcd2 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -3110,6 +3110,7 @@ static bool damon_find_biggest_system_ram(unsigned long *start, * @t: The monitoring target to set the region. * @start: The pointer to the start address of the region. * @end: The pointer to the end address of the region. + * @addr_unit: The address unit for the damon_ctx of @t. * @min_region_sz: Minimum region size. * * This function sets the region of @t as requested by @start and @end. 
If the @@ -3122,7 +3123,7 @@ static bool damon_find_biggest_system_ram(unsigned long *start, */ int damon_set_region_biggest_system_ram_default(struct damon_target *t, unsigned long *start, unsigned long *end, - unsigned long min_region_sz) + unsigned long addr_unit, unsigned long min_region_sz) { struct damon_addr_range addr_range; @@ -3130,12 +3131,12 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, return -EINVAL; if (!*start && !*end && - !damon_find_biggest_system_ram(start, end, 1)) + !damon_find_biggest_system_ram(start, end, addr_unit)) return -EINVAL; addr_range.start = *start; addr_range.end = *end; - return damon_set_regions(t, &addr_range, 1, min_region_sz); + return damon_set_regions(t, &addr_range, addr_unit, min_region_sz); } /* diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 7bc5c0b2aea3..133ea17e258d 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -345,6 +345,7 @@ static int damon_lru_sort_apply_parameters(void) err = damon_set_region_biggest_system_ram_default(param_target, &monitor_region_start, &monitor_region_end, + param_ctx->addr_unit, param_ctx->min_region_sz); if (err) goto out; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 43d76f5bed44..01f2f6cdbcdf 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -251,6 +251,7 @@ static int damon_reclaim_apply_parameters(void) err = damon_set_region_biggest_system_ram_default(param_target, &monitor_region_start, &monitor_region_end, + param_ctx->addr_unit, param_ctx->min_region_sz); if (err) goto out; -- cgit v1.2.3 From 0217c7fb4de4a40cee667eb21901f3204effe5ac Mon Sep 17 00:00:00 2001 From: Jianhui Zhou Date: Tue, 10 Mar 2026 19:05:26 +0800 Subject: mm/userfaultfd: fix hugetlb fault mutex hash calculation In mfill_atomic_hugetlb(), linear_page_index() is used to calculate the page index for hugetlb_fault_mutex_hash(). 
However, linear_page_index() returns the index in PAGE_SIZE units, while hugetlb_fault_mutex_hash() expects the index in huge page units. This mismatch means that different addresses within the same huge page can produce different hash values, leading to the use of different mutexes for the same huge page. This can cause races between faulting threads, which can corrupt the reservation map and trigger the BUG_ON in resv_map_release(). Fix this by introducing hugetlb_linear_page_index(), which returns the page index in huge page granularity, and using it in place of linear_page_index(). Link: https://lkml.kernel.org/r/20260310110526.335749-1-jianhuizzzzz@gmail.com Fixes: a08c7193e4f1 ("mm/filemap: remove hugetlb special casing in filemap.c") Signed-off-by: Jianhui Zhou Reported-by: syzbot+f525fd79634858f478e7@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=f525fd79634858f478e7 Acked-by: SeongJae Park Reviewed-by: David Hildenbrand (Arm) Acked-by: Mike Rapoport (Microsoft) Cc: Jane Chu Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: JonasZhou Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Xu Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 17 +++++++++++++++++ mm/userfaultfd.c | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index aaf3d472e6b5..9c098a02a09e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -792,6 +792,23 @@ static inline unsigned huge_page_shift(struct hstate *h) return h->order + PAGE_SHIFT; } +/** + * hugetlb_linear_page_index() - linear_page_index() but in hugetlb + * page size granularity. + * @vma: the hugetlb VMA + * @address: the virtual address within the VMA + * + * Return: the page offset within the mapping in huge page units. 
+ */ +static inline pgoff_t hugetlb_linear_page_index(struct vm_area_struct *vma, + unsigned long address) +{ + struct hstate *h = hstate_vma(vma); + + return ((address - vma->vm_start) >> huge_page_shift(h)) + + (vma->vm_pgoff >> huge_page_order(h)); +} + static inline bool order_is_gigantic(unsigned int order) { return order > MAX_PAGE_ORDER; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e19872e51878..2c565c7134b6 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -573,7 +573,7 @@ retry: * in the case of shared pmds. fault mutex prevents * races with other faulting threads. */ - idx = linear_page_index(dst_vma, dst_addr); + idx = hugetlb_linear_page_index(dst_vma, dst_addr); mapping = dst_vma->vm_file->f_mapping; hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); -- cgit v1.2.3 From a91fd9f710490a89713823be3e7790ac59a085f8 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Wed, 25 Mar 2026 05:40:18 -0600 Subject: mm: consolidate anonymous folio PTE mapping into helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: khugepaged cleanups and mTHP prerequisites", v4. The following series contains cleanups and prerequisites for my work on khugepaged mTHP support [1]. These have been separated out to ease review. The first patch in the series refactors the page fault folio to pte mapping and follows a similar convention as defined by map_anon_folio_pmd_(no)pf(). This not only cleans up the current implementation of do_anonymous_page(), but will allow for reuse later in the khugepaged mTHP implementation. The second patch adds a small is_pmd_order() helper to check if an order is the PMD order. This check is open-coded in a number of places. This patch aims to clean this up and will be used more in the khugepaged mTHP work. The third patch also adds a small DEFINE for (HPAGE_PMD_NR - 1) which is used often across the khugepaged code. 
The fourth and fifth patches come from the khugepaged mTHP patchset [1]. These two patches include the rename of function prefixes, and the unification of khugepaged and madvise_collapse via a new collapse_single_pmd function. Patch 1: refactor do_anonymous_page into map_anon_folio_pte_(no)pf Patch 2: add is_pmd_order helper Patch 3: Add define for (HPAGE_PMD_NR - 1) Patch 4: Refactor/rename hpage_collapse Patch 5: Refactoring to combine madvise_collapse and khugepaged A big thanks to everyone that has reviewed, tested, and participated in the development process. This patch (of 5): The anonymous page fault handler in do_anonymous_page() open-codes the sequence to map a newly allocated anonymous folio at the PTE level: - construct the PTE entry - add rmap - add to LRU - set the PTEs - update the MMU cache. Introduce two helpers to consolidate this duplicated logic, mirroring the existing map_anon_folio_pmd_nopf() pattern for PMD-level mappings: map_anon_folio_pte_nopf(): constructs the PTE entry, takes folio references, adds anon rmap and LRU. This function also handles the uffd_wp that can occur in the pf variant. The future khugepaged mTHP code calls this to handle mapping the new collapsed mTHP to its folio. map_anon_folio_pte_pf(): extends the nopf variant to handle MM_ANONPAGES counter updates, and mTHP fault allocation statistics for the page fault path. The zero-page read path in do_anonymous_page() is also untangled from the shared setpte label, since it does not allocate a folio and should not share the same mapping sequence as the write path. We can now leave nr_pages uninitialized at its declaration, and use the single page update_mmu_cache function to handle the zero page update. 
This refactoring will also help reduce code duplication between mm/memory.c and mm/khugepaged.c, and provides a clean API for PTE-level anonymous folio mapping that can be reused by future callers (like khugepaged mTHP support). Link: https://lkml.kernel.org/r/20260325114022.444081-1-npache@redhat.com Link: https://lkml.kernel.org/r/20260325114022.444081-2-npache@redhat.com Link: https://lore.kernel.org/all/20260122192841.128719-1-npache@redhat.com Signed-off-by: Nico Pache Suggested-by: Lorenzo Stoakes (Oracle) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Dev Jain Reviewed-by: Lance Yang Acked-by: David Hildenbrand (Arm) Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Baolin Wang Cc: Barry Song Cc: Brendan Jackman Cc: Byungchul Park Cc: Catalin Marinas Cc: David Rientjes Cc: Gregory Price Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Joshua Hahn Cc: Kefeng Wang Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Matthew Brost Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Nanyong Sun Cc: Pedro Falcato Cc: Peter Xu Cc: Rafael Aquini Cc: Rakie Kim Cc: Randy Dunlap Cc: Ryan Roberts Cc: Shivank Garg Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Takashi Iwai (SUSE) Cc: Thomas Hellström Cc: Usama Arif Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Wei Yang Cc: Will Deacon Cc: Yang Shi Cc: Zach O'Keefe Cc: Zi Yan Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++++ mm/memory.c | 61 ++++++++++++++++++++++++++++++++++++------------------ 2 files changed, 45 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 25ba5816e02b..16a1ad9a3397 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4916,4 +4916,8 @@ static inline bool snapshot_page_is_faithful(const struct page_snapshot *ps) void snapshot_page(struct page_snapshot *ps, const struct page *page); 
+void map_anon_folio_pte_nopf(struct folio *folio, pte_t *pte, + struct vm_area_struct *vma, unsigned long addr, + bool uffd_wp); + #endif /* _LINUX_MM_H */ diff --git a/mm/memory.c b/mm/memory.c index f21c804b50bf..7c350a38fecf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5197,6 +5197,37 @@ fallback: return folio_prealloc(vma->vm_mm, vma, vmf->address, true); } +void map_anon_folio_pte_nopf(struct folio *folio, pte_t *pte, + struct vm_area_struct *vma, unsigned long addr, + bool uffd_wp) +{ + const unsigned int nr_pages = folio_nr_pages(folio); + pte_t entry = folio_mk_pte(folio, vma->vm_page_prot); + + entry = pte_sw_mkyoung(entry); + + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry), vma); + if (uffd_wp) + entry = pte_mkuffd_wp(entry); + + folio_ref_add(folio, nr_pages - 1); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); + folio_add_lru_vma(folio, vma); + set_ptes(vma->vm_mm, addr, pte, entry, nr_pages); + update_mmu_cache_range(NULL, vma, addr, pte, nr_pages); +} + +static void map_anon_folio_pte_pf(struct folio *folio, pte_t *pte, + struct vm_area_struct *vma, unsigned long addr, bool uffd_wp) +{ + const unsigned int order = folio_order(folio); + + map_anon_folio_pte_nopf(folio, pte, vma, addr, uffd_wp); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1L << order); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_ALLOC); +} + /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. @@ -5208,7 +5239,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) unsigned long addr = vmf->address; struct folio *folio; vm_fault_t ret = 0; - int nr_pages = 1; + int nr_pages; pte_t entry; /* File mapping without ->vm_ops ? 
*/ @@ -5243,7 +5274,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_MISSING); } - goto setpte; + if (vmf_orig_pte_uffd_wp(vmf)) + entry = pte_mkuffd_wp(entry); + set_pte_at(vma->vm_mm, addr, vmf->pte, entry); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, addr, vmf->pte); + goto unlock; } /* Allocate our own private page. */ @@ -5267,11 +5304,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) */ __folio_mark_uptodate(folio); - entry = folio_mk_pte(folio, vma->vm_page_prot); - entry = pte_sw_mkyoung(entry); - if (vma->vm_flags & VM_WRITE) - entry = pte_mkwrite(pte_mkdirty(entry), vma); - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) goto release; @@ -5293,19 +5325,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) folio_put(folio); return handle_userfault(vmf, VM_UFFD_MISSING); } - - folio_ref_add(folio, nr_pages - 1); - add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); - count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); - folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); - folio_add_lru_vma(folio, vma); -setpte: - if (vmf_orig_pte_uffd_wp(vmf)) - entry = pte_mkuffd_wp(entry); - set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages); - - /* No need to invalidate - it was non-present before */ - update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages); + map_anon_folio_pte_pf(folio, vmf->pte, vma, addr, + vmf_orig_pte_uffd_wp(vmf)); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); -- cgit v1.2.3 From b90c453d2664ba445383956560581f9db708584f Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Wed, 25 Mar 2026 05:40:19 -0600 Subject: mm: introduce is_pmd_order helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to add mTHP support to khugepaged, we will often be checking if a given order is (or 
is not) a PMD order. Some places in the kernel already use this check, so let's create a simple helper function to keep the code clean and readable. Link: https://lkml.kernel.org/r/20260325114022.444081-3-npache@redhat.com Signed-off-by: Nico Pache Acked-by: David Hildenbrand (Arm) Reviewed-by: Baolin Wang Reviewed-by: Dev Jain Reviewed-by: Wei Yang Reviewed-by: Lance Yang Reviewed-by: Barry Song Reviewed-by: Zi Yan Reviewed-by: Pedro Falcato Reviewed-by: Lorenzo Stoakes Suggested-by: Lorenzo Stoakes Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Brendan Jackman Cc: Byungchul Park Cc: Catalin Marinas Cc: David Rientjes Cc: Gregory Price Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Joshua Hahn Cc: Kefeng Wang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Matthew Brost Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Nanyong Sun Cc: Peter Xu Cc: Rafael Aquini Cc: Rakie Kim Cc: Randy Dunlap Cc: Ryan Roberts Cc: Shivank Garg Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Takashi Iwai (SUSE) Cc: Thomas Hellström Cc: Usama Arif Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 5 +++++ mm/huge_memory.c | 2 +- mm/khugepaged.c | 6 +++--- mm/memory.c | 2 +- mm/mempolicy.c | 2 +- mm/page_alloc.c | 4 ++-- mm/shmem.c | 3 +-- 7 files changed, 14 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a4d9f964dfde..bd7f0e1d8094 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -771,6 +771,11 @@ static inline bool pmd_is_huge(pmd_t pmd) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline bool is_pmd_order(unsigned int order) +{ + return order == HPAGE_PMD_ORDER; +} + static inline int split_folio_to_list_to_order(struct folio *folio, 
struct list_head *list, int new_order) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9fea52ccad56..1c1a7cf7b209 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -4159,7 +4159,7 @@ out_unlock: i_mmap_unlock_read(mapping); out: xas_destroy(&xas); - if (old_order == HPAGE_PMD_ORDER) + if (is_pmd_order(old_order)) count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); count_mthp_stat(old_order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED); return ret; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index f972a9a65e3a..c6a5d9d1f252 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1540,7 +1540,7 @@ static enum scan_result try_collapse_pte_mapped_thp(struct mm_struct *mm, unsign if (IS_ERR(folio)) return SCAN_PAGE_NULL; - if (folio_order(folio) != HPAGE_PMD_ORDER) { + if (!is_pmd_order(folio_order(folio))) { result = SCAN_PAGE_COMPOUND; goto drop_folio; } @@ -2023,7 +2023,7 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr, * we locked the first folio, then a THP might be there already. * This will be discovered on the first iteration. 
*/ - if (folio_order(folio) == HPAGE_PMD_ORDER) { + if (is_pmd_order(folio_order(folio))) { result = SCAN_PTE_MAPPED_HUGEPAGE; goto out_unlock; } @@ -2351,7 +2351,7 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, continue; } - if (folio_order(folio) == HPAGE_PMD_ORDER) { + if (is_pmd_order(folio_order(folio))) { result = SCAN_PTE_MAPPED_HUGEPAGE; /* * PMD-sized THP implies that we can only try diff --git a/mm/memory.c b/mm/memory.c index 7c350a38fecf..6d54e5ec82f2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5435,7 +5435,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return ret; - if (folio_order(folio) != HPAGE_PMD_ORDER) + if (!is_pmd_order(folio_order(folio))) return ret; page = &folio->page; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0e5175f1c767..e5528c35bbb8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2449,7 +2449,7 @@ static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && /* filter "hugepage" allocation, unless from alloc_pages() */ - order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { + is_pmd_order(order) && ilx != NO_INTERLEAVE_INDEX) { /* * For hugepage allocation and non-interleave policy which * allows the current node (or other explicitly preferred diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 937e9b850709..cdde59e56a55 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -651,7 +651,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order) #ifdef CONFIG_TRANSPARENT_HUGEPAGE bool movable; if (order > PAGE_ALLOC_COSTLY_ORDER) { - VM_BUG_ON(order != HPAGE_PMD_ORDER); + VM_BUG_ON(!is_pmd_order(order)); movable = migratetype == MIGRATE_MOVABLE; @@ -683,7 +683,7 @@ static inline bool pcp_allowed_order(unsigned int order) if (order <= PAGE_ALLOC_COSTLY_ORDER) return true; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (order == 
HPAGE_PMD_ORDER) + if (is_pmd_order(order)) return true; #endif return false; diff --git a/mm/shmem.c b/mm/shmem.c index 5e7dcf5bc5d3..6fa1e8340c93 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -5558,8 +5558,7 @@ static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj, spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "inherit")) { /* Do not override huge allocation policy with non-PMD sized mTHP */ - if (shmem_huge == SHMEM_HUGE_FORCE && - order != HPAGE_PMD_ORDER) + if (shmem_huge == SHMEM_HUGE_FORCE && !is_pmd_order(order)) return -EINVAL; spin_lock(&huge_shmem_orders_lock); -- cgit v1.2.3 From 22688ade3b54b2f4f2887c7dad75db6d588ae07c Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:41 +0100 Subject: mm/sparse: remove sparse_decode_mem_map() section_deactivate() applies to CONFIG_SPARSEMEM_VMEMMAP only. So we can just use pfn_to_page() (after making sure we have the start PFN of the section), and remove sparse_decode_mem_map(). 
Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-9-096addc8800d@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 2 -- mm/sparse.c | 16 +--------------- 2 files changed, 1 insertion(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index e77ef3d7ff73..815e908c4135 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -308,8 +308,6 @@ extern int sparse_add_section(int nid, unsigned long pfn, struct dev_pagemap *pgmap); extern void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap); -extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, - unsigned long pnum); extern struct zone *zone_for_pfn_range(enum mmop online_type, int nid, struct memory_group *group, unsigned long start_pfn, unsigned long nr_pages); diff --git a/mm/sparse.c b/mm/sparse.c index 875f718a4c79..b5825c9ee2f2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -274,18 +274,6 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p return coded_mem_map; } -#ifdef CONFIG_MEMORY_HOTPLUG -/* - * Decode mem_map from the coded memmap - */ -struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum) -{ - /* mask off the extra low bits of information */ - coded_mem_map &= SECTION_MAP_MASK; - return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); -} -#endif /* CONFIG_MEMORY_HOTPLUG */ - static void __meminit sparse_init_one_section(struct mem_section *ms, unsigned long pnum, struct page *mem_map, struct mem_section_usage *usage, unsigned long flags) @@ -754,8 +742,6 @@ static void 
section_deactivate(unsigned long pfn, unsigned long nr_pages, empty = is_subsection_map_empty(ms); if (empty) { - unsigned long section_nr = pfn_to_section_nr(pfn); - /* * Mark the section invalid so that valid_section() * return false. This prevents code from dereferencing @@ -774,7 +760,7 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, kfree_rcu(ms->usage, rcu); WRITE_ONCE(ms->usage, NULL); } - memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); + memmap = pfn_to_page(SECTION_ALIGN_DOWN(pfn)); } /* -- cgit v1.2.3 From fead6dcff83b02f8d6dc3c1ebbe4e09c05c54ee5 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:43 +0100 Subject: mm: prepare to move subsection_map_init() to mm/sparse-vmemmap.c We want to move subsection_map_init() to mm/sparse-vmemmap.c. To prepare for getting rid of subsection_map_init() in mm/sparse.c completely, use a static inline function for !CONFIG_SPARSEMEM_VMEMMAP. While at it, move the declaration to internal.h and rename it to "sparse_init_subsection_map()". 
Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-11-096addc8800d@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 3 --- mm/internal.h | 12 ++++++++++++ mm/mm_init.c | 2 +- mm/sparse.c | 6 +----- 4 files changed, 14 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3f651baf7e2b..7cf4a194aea2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1982,8 +1982,6 @@ struct mem_section_usage { unsigned long pageblock_flags[0]; }; -void subsection_map_init(unsigned long pfn, unsigned long nr_pages); - struct page; struct page_ext; struct mem_section { @@ -2376,7 +2374,6 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr) #define sparse_vmemmap_init_nid_early(_nid) do {} while (0) #define sparse_vmemmap_init_nid_late(_nid) do {} while (0) #define pfn_in_present_section pfn_valid -#define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ /* diff --git a/mm/internal.h b/mm/internal.h index 62d80fd37ae1..11b0c91b6d9d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -959,12 +959,24 @@ void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int, bool); +/* + * mm/sparse.c + */ #ifdef CONFIG_SPARSEMEM void sparse_init(void); #else static inline void sparse_init(void) {} #endif /* CONFIG_SPARSEMEM */ +#ifdef CONFIG_SPARSEMEM_VMEMMAP +void sparse_init_subsection_map(unsigned long pfn, unsigned long nr_pages); +#else +static inline void sparse_init_subsection_map(unsigned long pfn, + unsigned long nr_pages) +{ +} +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + 
#if defined CONFIG_COMPACTION || defined CONFIG_CMA /* diff --git a/mm/mm_init.c b/mm/mm_init.c index 5b261f86ba6f..4324b93ccebd 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1896,7 +1896,7 @@ static void __init free_area_init(void) pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); - subsection_map_init(start_pfn, end_pfn - start_pfn); + sparse_init_subsection_map(start_pfn, end_pfn - start_pfn); } /* Initialise every node */ diff --git a/mm/sparse.c b/mm/sparse.c index e2048b1fbf5f..c96ac5e70c22 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -185,7 +185,7 @@ static void subsection_mask_set(unsigned long *map, unsigned long pfn, bitmap_set(map, idx, end - idx + 1); } -void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages) +void __init sparse_init_subsection_map(unsigned long pfn, unsigned long nr_pages) { int end_sec_nr = pfn_to_section_nr(pfn + nr_pages - 1); unsigned long nr, start_sec_nr = pfn_to_section_nr(pfn); @@ -207,10 +207,6 @@ void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages) nr_pages -= pfns; } } -#else -void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages) -{ -} #endif /* Record a memory area against a node. */ -- cgit v1.2.3 From f62a3bf227c95a105fccb5a2062367387cd49430 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:45 +0100 Subject: mm/sparse: move sparse_init_one_section() to internal.h While at it, convert the BUG_ON to a VM_WARN_ON_ONCE, avoid long lines, and merge sparse_encode_mem_map() into its only caller sparse_init_one_section(). Clarify the comment a bit, pointing at page_to_pfn(). 
[david@kernel.org: s/VM_WARN_ON/VM_WARN_ON_ONCE/] Link: https://lkml.kernel.org/r/6b04c1a1-74e7-42e8-8523-a40802e5dacc@kernel.org Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-13-096addc8800d@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 +- mm/internal.h | 22 ++++++++++++++++++++++ mm/sparse.c | 24 ------------------------ 3 files changed, 23 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7cf4a194aea2..ed335567d64e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1988,7 +1988,7 @@ struct mem_section { /* * This is, logically, a pointer to an array of struct * pages. However, it is stored with some other magic. - * (see sparse.c::sparse_init_one_section()) + * (see sparse_init_one_section()) * * Additionally during early boot we encode node id of * the location of the section here to guide allocation. diff --git a/mm/internal.h b/mm/internal.h index 11b0c91b6d9d..e14f58527688 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -964,6 +964,28 @@ void memmap_init_range(unsigned long, int, unsigned long, unsigned long, */ #ifdef CONFIG_SPARSEMEM void sparse_init(void); + +static inline void sparse_init_one_section(struct mem_section *ms, + unsigned long pnum, struct page *mem_map, + struct mem_section_usage *usage, unsigned long flags) +{ + unsigned long coded_mem_map; + + BUILD_BUG_ON(SECTION_MAP_LAST_BIT > PFN_SECTION_SHIFT); + + /* + * We encode the start PFN of the section into the mem_map such that + * page_to_pfn() on !CONFIG_SPARSEMEM_VMEMMAP can simply subtract it + * from the page pointer to obtain the PFN. 
+ */ + coded_mem_map = (unsigned long)(mem_map - section_nr_to_pfn(pnum)); + VM_WARN_ON_ONCE(coded_mem_map & ~SECTION_MAP_MASK); + + ms->section_mem_map &= ~SECTION_MAP_MASK; + ms->section_mem_map |= coded_mem_map; + ms->section_mem_map |= flags | SECTION_HAS_MEM_MAP; + ms->usage = usage; +} #else static inline void sparse_init(void) {} #endif /* CONFIG_SPARSEMEM */ diff --git a/mm/sparse.c b/mm/sparse.c index 5c9cad390282..ed5de1a25f04 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -256,30 +256,6 @@ static void __init memblocks_present(void) memory_present(nid, start, end); } -/* - * Subtle, we encode the real pfn into the mem_map such that - * the identity pfn - section_mem_map will return the actual - * physical page frame number. - */ -static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum) -{ - unsigned long coded_mem_map = - (unsigned long)(mem_map - (section_nr_to_pfn(pnum))); - BUILD_BUG_ON(SECTION_MAP_LAST_BIT > PFN_SECTION_SHIFT); - BUG_ON(coded_mem_map & ~SECTION_MAP_MASK); - return coded_mem_map; -} - -static void __meminit sparse_init_one_section(struct mem_section *ms, - unsigned long pnum, struct page *mem_map, - struct mem_section_usage *usage, unsigned long flags) -{ - ms->section_mem_map &= ~SECTION_MAP_MASK; - ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) - | SECTION_HAS_MEM_MAP | flags; - ms->usage = usage; -} - static unsigned long usemap_size(void) { return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long); -- cgit v1.2.3 From 738de20c4fafe64290c5086d683254f60e837db6 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:47 +0100 Subject: mm/sparse: move memory hotplug bits to sparse-vmemmap.c Let's move all memory hotplug related code to sparse-vmemmap.c. We only have to expose sparse_index_init(). While at it, drop the definition of sparse_index_init() for !CONFIG_SPARSEMEM, which is unused, and place the declaration in internal.h. 
Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-15-096addc8800d@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 - mm/internal.h | 4 + mm/sparse-vmemmap.c | 304 ++++++++++++++++++++++++++++++++++++++++++++++++ mm/sparse.c | 310 +------------------------------------------------ 4 files changed, 310 insertions(+), 309 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ed335567d64e..4a20df132258 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -2370,7 +2370,6 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr) #endif #else -#define sparse_index_init(_sec, _nid) do {} while (0) #define sparse_vmemmap_init_nid_early(_nid) do {} while (0) #define sparse_vmemmap_init_nid_late(_nid) do {} while (0) #define pfn_in_present_section pfn_valid diff --git a/mm/internal.h b/mm/internal.h index 4e753bbf00ae..9ae0ee6c34f9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -964,6 +964,7 @@ void memmap_init_range(unsigned long, int, unsigned long, unsigned long, */ #ifdef CONFIG_SPARSEMEM void sparse_init(void); +int sparse_index_init(unsigned long section_nr, int nid); static inline void sparse_init_one_section(struct mem_section *ms, unsigned long pnum, struct page *mem_map, @@ -999,6 +1000,9 @@ static inline void __section_mark_present(struct mem_section *ms, static inline void sparse_init(void) {} #endif /* CONFIG_SPARSEMEM */ +/* + * mm/sparse-vmemmap.c + */ #ifdef CONFIG_SPARSEMEM_VMEMMAP void sparse_init_subsection_map(unsigned long pfn, unsigned long nr_pages); #else diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 842ed2f0bce6..24a37676cecb 100644 --- 
a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -591,3 +591,307 @@ void __init sparse_vmemmap_init_nid_late(int nid) hugetlb_vmemmap_init_late(nid); } #endif + +static void subsection_mask_set(unsigned long *map, unsigned long pfn, + unsigned long nr_pages) +{ + int idx = subsection_map_index(pfn); + int end = subsection_map_index(pfn + nr_pages - 1); + + bitmap_set(map, idx, end - idx + 1); +} + +void __init sparse_init_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + int end_sec_nr = pfn_to_section_nr(pfn + nr_pages - 1); + unsigned long nr, start_sec_nr = pfn_to_section_nr(pfn); + + for (nr = start_sec_nr; nr <= end_sec_nr; nr++) { + struct mem_section *ms; + unsigned long pfns; + + pfns = min(nr_pages, PAGES_PER_SECTION + - (pfn & ~PAGE_SECTION_MASK)); + ms = __nr_to_section(nr); + subsection_mask_set(ms->usage->subsection_map, pfn, pfns); + + pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr, + pfns, subsection_map_index(pfn), + subsection_map_index(pfn + pfns - 1)); + + pfn += pfns; + nr_pages -= pfns; + } +} + +#ifdef CONFIG_MEMORY_HOTPLUG + +/* Mark all memory sections within the pfn range as online */ +void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(pfn); + struct mem_section *ms = __nr_to_section(section_nr); + + ms->section_mem_map |= SECTION_IS_ONLINE; + } +} + +/* Mark all memory sections within the pfn range as offline */ +void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(pfn); + struct mem_section *ms = __nr_to_section(section_nr); + + ms->section_mem_map &= ~SECTION_IS_ONLINE; + } +} + +static struct page * __meminit populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, 
struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) +{ + return __populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); +} + +static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap) +{ + unsigned long start = (unsigned long) pfn_to_page(pfn); + unsigned long end = start + nr_pages * sizeof(struct page); + + vmemmap_free(start, end, altmap); +} +static void free_map_bootmem(struct page *memmap) +{ + unsigned long start = (unsigned long)memmap; + unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); + + vmemmap_free(start, end, NULL); +} + +static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; + DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 }; + struct mem_section *ms = __pfn_to_section(pfn); + unsigned long *subsection_map = ms->usage + ? &ms->usage->subsection_map[0] : NULL; + + subsection_mask_set(map, pfn, nr_pages); + if (subsection_map) + bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION); + + if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION), + "section already deactivated (%#lx + %ld)\n", + pfn, nr_pages)) + return -EINVAL; + + bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION); + return 0; +} + +static bool is_subsection_map_empty(struct mem_section *ms) +{ + return bitmap_empty(&ms->usage->subsection_map[0], + SUBSECTIONS_PER_SECTION); +} + +static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + struct mem_section *ms = __pfn_to_section(pfn); + DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; + unsigned long *subsection_map; + int rc = 0; + + subsection_mask_set(map, pfn, nr_pages); + + subsection_map = &ms->usage->subsection_map[0]; + + if (bitmap_empty(map, SUBSECTIONS_PER_SECTION)) + rc = -EINVAL; + else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION)) + rc = -EEXIST; + else + 
bitmap_or(subsection_map, map, subsection_map, + SUBSECTIONS_PER_SECTION); + + return rc; +} + +/* + * To deactivate a memory region, there are 3 cases to handle: + * + * 1. deactivation of a partial hot-added section: + * a) section was present at memory init. + * b) section was hot-added post memory init. + * 2. deactivation of a complete hot-added section. + * 3. deactivation of a complete section from memory init. + * + * For 1, when subsection_map does not empty we will not be freeing the + * usage map, but still need to free the vmemmap range. + */ +static void section_deactivate(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap) +{ + struct mem_section *ms = __pfn_to_section(pfn); + bool section_is_early = early_section(ms); + struct page *memmap = NULL; + bool empty; + + if (clear_subsection_map(pfn, nr_pages)) + return; + + empty = is_subsection_map_empty(ms); + if (empty) { + /* + * Mark the section invalid so that valid_section() + * return false. This prevents code from dereferencing + * ms->usage array. + */ + ms->section_mem_map &= ~SECTION_HAS_MEM_MAP; + + /* + * When removing an early section, the usage map is kept (as the + * usage maps of other sections fall into the same page). It + * will be re-used when re-adding the section - which is then no + * longer an early section. If the usage map is PageReserved, it + * was allocated during boot. + */ + if (!PageReserved(virt_to_page(ms->usage))) { + kfree_rcu(ms->usage, rcu); + WRITE_ONCE(ms->usage, NULL); + } + memmap = pfn_to_page(SECTION_ALIGN_DOWN(pfn)); + } + + /* + * The memmap of early sections is always fully populated. See + * section_activate() and pfn_valid() . 
+ */ + if (!section_is_early) { + memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE))); + depopulate_section_memmap(pfn, nr_pages, altmap); + } else if (memmap) { + memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), + PAGE_SIZE))); + free_map_bootmem(memmap); + } + + if (empty) + ms->section_mem_map = (unsigned long)NULL; +} + +static struct page * __meminit section_activate(int nid, unsigned long pfn, + unsigned long nr_pages, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) +{ + struct mem_section *ms = __pfn_to_section(pfn); + struct mem_section_usage *usage = NULL; + struct page *memmap; + int rc; + + if (!ms->usage) { + usage = kzalloc(mem_section_usage_size(), GFP_KERNEL); + if (!usage) + return ERR_PTR(-ENOMEM); + ms->usage = usage; + } + + rc = fill_subsection_map(pfn, nr_pages); + if (rc) { + if (usage) + ms->usage = NULL; + kfree(usage); + return ERR_PTR(rc); + } + + /* + * The early init code does not consider partially populated + * initial sections, it simply assumes that memory will never be + * referenced. If we hot-add memory into such a section then we + * do not need to populate the memmap and can simply reuse what + * is already there. + */ + if (nr_pages < PAGES_PER_SECTION && early_section(ms)) + return pfn_to_page(pfn); + + memmap = populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); + if (!memmap) { + section_deactivate(pfn, nr_pages, altmap); + return ERR_PTR(-ENOMEM); + } + memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)); + + return memmap; +} + +/** + * sparse_add_section - add a memory section, or populate an existing one + * @nid: The node to add section on + * @start_pfn: start pfn of the memory range + * @nr_pages: number of pfns to add in the section + * @altmap: alternate pfns to allocate the memmap backing store + * @pgmap: alternate compound page geometry for devmap mappings + * + * This is only intended for hotplug. 
+ * + * Note that only VMEMMAP supports sub-section aligned hotplug, + * the proper alignment and size are gated by check_pfn_span(). + * + * + * Return: + * * 0 - On success. + * * -EEXIST - Section has been present. + * * -ENOMEM - Out of memory. + */ +int __meminit sparse_add_section(int nid, unsigned long start_pfn, + unsigned long nr_pages, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) +{ + unsigned long section_nr = pfn_to_section_nr(start_pfn); + struct mem_section *ms; + struct page *memmap; + int ret; + + ret = sparse_index_init(section_nr, nid); + if (ret < 0) + return ret; + + memmap = section_activate(nid, start_pfn, nr_pages, altmap, pgmap); + if (IS_ERR(memmap)) + return PTR_ERR(memmap); + + /* + * Poison uninitialized struct pages in order to catch invalid flags + * combinations. + */ + page_init_poison(memmap, sizeof(struct page) * nr_pages); + + ms = __nr_to_section(section_nr); + __section_mark_present(ms, section_nr); + + /* Align memmap to section boundary in the subsection case */ + if (section_nr_to_pfn(section_nr) != start_pfn) + memmap = pfn_to_page(section_nr_to_pfn(section_nr)); + sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0); + + return 0; +} + +void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap) +{ + struct mem_section *ms = __pfn_to_section(pfn); + + if (WARN_ON_ONCE(!valid_section(ms))) + return; + + section_deactivate(pfn, nr_pages, altmap); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/mm/sparse.c b/mm/sparse.c index ecd4c41c0ff0..007fd52c621e 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -79,7 +79,7 @@ static noinline struct mem_section __ref *sparse_index_alloc(int nid) return section; } -static int __meminit sparse_index_init(unsigned long section_nr, int nid) +int __meminit sparse_index_init(unsigned long section_nr, int nid) { unsigned long root = SECTION_NR_TO_ROOT(section_nr); struct mem_section *section; @@ -103,7 +103,7 @@ static int __meminit 
sparse_index_init(unsigned long section_nr, int nid) return 0; } #else /* !SPARSEMEM_EXTREME */ -static inline int sparse_index_init(unsigned long section_nr, int nid) +int sparse_index_init(unsigned long section_nr, int nid) { return 0; } @@ -167,40 +167,6 @@ static inline unsigned long first_present_section_nr(void) return next_present_section_nr(-1); } -#ifdef CONFIG_SPARSEMEM_VMEMMAP -static void subsection_mask_set(unsigned long *map, unsigned long pfn, - unsigned long nr_pages) -{ - int idx = subsection_map_index(pfn); - int end = subsection_map_index(pfn + nr_pages - 1); - - bitmap_set(map, idx, end - idx + 1); -} - -void __init sparse_init_subsection_map(unsigned long pfn, unsigned long nr_pages) -{ - int end_sec_nr = pfn_to_section_nr(pfn + nr_pages - 1); - unsigned long nr, start_sec_nr = pfn_to_section_nr(pfn); - - for (nr = start_sec_nr; nr <= end_sec_nr; nr++) { - struct mem_section *ms; - unsigned long pfns; - - pfns = min(nr_pages, PAGES_PER_SECTION - - (pfn & ~PAGE_SECTION_MASK)); - ms = __nr_to_section(nr); - subsection_mask_set(ms->usage->subsection_map, pfn, pfns); - - pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr, - pfns, subsection_map_index(pfn), - subsection_map_index(pfn + pfns - 1)); - - pfn += pfns; - nr_pages -= pfns; - } -} -#endif - /* Record a memory area against a node. 
*/ static void __init memory_present(int nid, unsigned long start, unsigned long end) { @@ -482,275 +448,3 @@ void __init sparse_init(void) sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count); vmemmap_populate_print_last(); } - -#ifdef CONFIG_MEMORY_HOTPLUG - -/* Mark all memory sections within the pfn range as online */ -void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long pfn; - - for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { - unsigned long section_nr = pfn_to_section_nr(pfn); - struct mem_section *ms = __nr_to_section(section_nr); - - ms->section_mem_map |= SECTION_IS_ONLINE; - } -} - -/* Mark all memory sections within the pfn range as offline */ -void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long pfn; - - for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { - unsigned long section_nr = pfn_to_section_nr(pfn); - struct mem_section *ms = __nr_to_section(section_nr); - - ms->section_mem_map &= ~SECTION_IS_ONLINE; - } -} - -static struct page * __meminit populate_section_memmap(unsigned long pfn, - unsigned long nr_pages, int nid, struct vmem_altmap *altmap, - struct dev_pagemap *pgmap) -{ - return __populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); -} - -static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) -{ - unsigned long start = (unsigned long) pfn_to_page(pfn); - unsigned long end = start + nr_pages * sizeof(struct page); - - vmemmap_free(start, end, altmap); -} -static void free_map_bootmem(struct page *memmap) -{ - unsigned long start = (unsigned long)memmap; - unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); - - vmemmap_free(start, end, NULL); -} - -static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages) -{ - DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; - DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 }; - struct 
mem_section *ms = __pfn_to_section(pfn); - unsigned long *subsection_map = ms->usage - ? &ms->usage->subsection_map[0] : NULL; - - subsection_mask_set(map, pfn, nr_pages); - if (subsection_map) - bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION); - - if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION), - "section already deactivated (%#lx + %ld)\n", - pfn, nr_pages)) - return -EINVAL; - - bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION); - return 0; -} - -static bool is_subsection_map_empty(struct mem_section *ms) -{ - return bitmap_empty(&ms->usage->subsection_map[0], - SUBSECTIONS_PER_SECTION); -} - -static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) -{ - struct mem_section *ms = __pfn_to_section(pfn); - DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; - unsigned long *subsection_map; - int rc = 0; - - subsection_mask_set(map, pfn, nr_pages); - - subsection_map = &ms->usage->subsection_map[0]; - - if (bitmap_empty(map, SUBSECTIONS_PER_SECTION)) - rc = -EINVAL; - else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION)) - rc = -EEXIST; - else - bitmap_or(subsection_map, map, subsection_map, - SUBSECTIONS_PER_SECTION); - - return rc; -} - -/* - * To deactivate a memory region, there are 3 cases to handle: - * - * 1. deactivation of a partial hot-added section: - * a) section was present at memory init. - * b) section was hot-added post memory init. - * 2. deactivation of a complete hot-added section. - * 3. deactivation of a complete section from memory init. - * - * For 1, when subsection_map does not empty we will not be freeing the - * usage map, but still need to free the vmemmap range. 
- */ -static void section_deactivate(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) -{ - struct mem_section *ms = __pfn_to_section(pfn); - bool section_is_early = early_section(ms); - struct page *memmap = NULL; - bool empty; - - if (clear_subsection_map(pfn, nr_pages)) - return; - - empty = is_subsection_map_empty(ms); - if (empty) { - /* - * Mark the section invalid so that valid_section() - * return false. This prevents code from dereferencing - * ms->usage array. - */ - ms->section_mem_map &= ~SECTION_HAS_MEM_MAP; - - /* - * When removing an early section, the usage map is kept (as the - * usage maps of other sections fall into the same page). It - * will be re-used when re-adding the section - which is then no - * longer an early section. If the usage map is PageReserved, it - * was allocated during boot. - */ - if (!PageReserved(virt_to_page(ms->usage))) { - kfree_rcu(ms->usage, rcu); - WRITE_ONCE(ms->usage, NULL); - } - memmap = pfn_to_page(SECTION_ALIGN_DOWN(pfn)); - } - - /* - * The memmap of early sections is always fully populated. See - * section_activate() and pfn_valid() . 
- */ - if (!section_is_early) { - memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE))); - depopulate_section_memmap(pfn, nr_pages, altmap); - } else if (memmap) { - memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), - PAGE_SIZE))); - free_map_bootmem(memmap); - } - - if (empty) - ms->section_mem_map = (unsigned long)NULL; -} - -static struct page * __meminit section_activate(int nid, unsigned long pfn, - unsigned long nr_pages, struct vmem_altmap *altmap, - struct dev_pagemap *pgmap) -{ - struct mem_section *ms = __pfn_to_section(pfn); - struct mem_section_usage *usage = NULL; - struct page *memmap; - int rc; - - if (!ms->usage) { - usage = kzalloc(mem_section_usage_size(), GFP_KERNEL); - if (!usage) - return ERR_PTR(-ENOMEM); - ms->usage = usage; - } - - rc = fill_subsection_map(pfn, nr_pages); - if (rc) { - if (usage) - ms->usage = NULL; - kfree(usage); - return ERR_PTR(rc); - } - - /* - * The early init code does not consider partially populated - * initial sections, it simply assumes that memory will never be - * referenced. If we hot-add memory into such a section then we - * do not need to populate the memmap and can simply reuse what - * is already there. - */ - if (nr_pages < PAGES_PER_SECTION && early_section(ms)) - return pfn_to_page(pfn); - - memmap = populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); - if (!memmap) { - section_deactivate(pfn, nr_pages, altmap); - return ERR_PTR(-ENOMEM); - } - memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)); - - return memmap; -} - -/** - * sparse_add_section - add a memory section, or populate an existing one - * @nid: The node to add section on - * @start_pfn: start pfn of the memory range - * @nr_pages: number of pfns to add in the section - * @altmap: alternate pfns to allocate the memmap backing store - * @pgmap: alternate compound page geometry for devmap mappings - * - * This is only intended for hotplug. 
- * - * Note that only VMEMMAP supports sub-section aligned hotplug, - * the proper alignment and size are gated by check_pfn_span(). - * - * - * Return: - * * 0 - On success. - * * -EEXIST - Section has been present. - * * -ENOMEM - Out of memory. - */ -int __meminit sparse_add_section(int nid, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap, - struct dev_pagemap *pgmap) -{ - unsigned long section_nr = pfn_to_section_nr(start_pfn); - struct mem_section *ms; - struct page *memmap; - int ret; - - ret = sparse_index_init(section_nr, nid); - if (ret < 0) - return ret; - - memmap = section_activate(nid, start_pfn, nr_pages, altmap, pgmap); - if (IS_ERR(memmap)) - return PTR_ERR(memmap); - - /* - * Poison uninitialized struct pages in order to catch invalid flags - * combinations. - */ - page_init_poison(memmap, sizeof(struct page) * nr_pages); - - ms = __nr_to_section(section_nr); - __section_mark_present(ms, section_nr); - - /* Align memmap to section boundary in the subsection case */ - if (section_nr_to_pfn(section_nr) != start_pfn) - memmap = pfn_to_page(section_nr_to_pfn(section_nr)); - sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0); - - return 0; -} - -void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) -{ - struct mem_section *ms = __pfn_to_section(pfn); - - if (WARN_ON_ONCE(!valid_section(ms))) - return; - - section_deactivate(pfn, nr_pages, altmap); -} -#endif /* CONFIG_MEMORY_HOTPLUG */ -- cgit v1.2.3 From 6ebf98d71f9b509e833e0af00795ad3723d2f410 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Thu, 19 Mar 2026 09:19:41 +0100 Subject: mm: introduce CONFIG_NUMA_MIGRATION and simplify CONFIG_MIGRATION CONFIG_MEMORY_HOTREMOVE, CONFIG_COMPACTION and CONFIG_CMA all select CONFIG_MIGRATION, because they require it to work (users). Only CONFIG_NUMA_BALANCING and CONFIG_BALLOON_MIGRATION depend on CONFIG_MIGRATION. 
CONFIG_BALLOON_MIGRATION is not an actual user, but an implementation of migration support, so the dependency is correct (CONFIG_BALLOON_MIGRATION does not make any sense without CONFIG_MIGRATION). However, kconfig-language.rst clearly states "In general use select only for non-visible symbols". So far CONFIG_MIGRATION is user-visible ... and the dependencies are rather confusing. The whole reason why CONFIG_MIGRATION is user-visible is because of CONFIG_NUMA: some users might want CONFIG_NUMA but not page migration support. Let's clean all that up by introducing a dedicated CONFIG_NUMA_MIGRATION config option for that purpose only. Make CONFIG_NUMA_BALANCING that so far depended on CONFIG_NUMA && CONFIG_MIGRATION to depend on CONFIG_NUMA_MIGRATION instead. CONFIG_NUMA_MIGRATION will depend on CONFIG_NUMA && CONFIG_MMU. CONFIG_NUMA_MIGRATION is user-visible and will default to "y". We use that default so new configs will automatically enable it, just like it was the case with CONFIG_MIGRATION. The downside is that some configs that used to have CONFIG_MIGRATION=n might get it re-enabled by CONFIG_NUMA_MIGRATION=y, which shouldn't be a problem. CONFIG_MIGRATION is now a non-visible config option. Any code that selects CONFIG_MIGRATION (as before) must depend directly or indirectly on CONFIG_MMU. CONFIG_NUMA_MIGRATION is responsible for any NUMA migration code, which is mempolicy migration code, memory-tiering code, and move_pages() code in migrate.c. CONFIG_NUMA_BALANCING uses its functionality. Note that this implies that with CONFIG_NUMA_MIGRATION=n, move_pages() will not be available even though CONFIG_MIGRATION=y, which is an expected change. In migrate.c, we can remove the CONFIG_NUMA check as both CONFIG_NUMA_MIGRATION and CONFIG_NUMA_BALANCING depend on it. With this change, CONFIG_MIGRATION is an internal config, all users of migration select CONFIG_MIGRATION, and only CONFIG_BALLOON_MIGRATION depends on it. 
Link: https://lkml.kernel.org/r/20260319-config_migration-v1-2-42270124966f@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Zi Yan Reviewed-by: Jonathan Cameron Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alistair Popple Cc: "Borislav Petkov (AMD)" Cc: Byungchul Park Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Gregory Price Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: "Huang, Ying" Cc: Ingo Molnar Cc: Joshua Hahn Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Brost Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Rakie Kim Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 2 +- init/Kconfig | 2 +- mm/Kconfig | 26 +++++++++++++------------- mm/memory-tiers.c | 12 ++++++------ mm/mempolicy.c | 2 +- mm/migrate.c | 5 ++--- 6 files changed, 24 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 96987d9d95a8..7999c58629ee 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -52,7 +52,7 @@ int mt_perf_to_adistance(struct access_coordinate *perf, int *adist); struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types); void mt_put_memory_types(struct list_head *memory_types); -#ifdef CONFIG_MIGRATION +#ifdef CONFIG_NUMA_MIGRATION int next_demotion_node(int node, const nodemask_t *allowed_mask); void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets); bool node_is_toptier(int node); diff --git a/init/Kconfig b/init/Kconfig index 444ce811ea67..3648e401b78b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -997,7 +997,7 @@ config NUMA_BALANCING bool "Memory placement aware NUMA scheduler" depends on ARCH_SUPPORTS_NUMA_BALANCING depends 
on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION && !PREEMPT_RT + depends on SMP && NUMA_MIGRATION && !PREEMPT_RT help This option adds support for automatic NUMA aware memory/task placement. The mechanism is quite primitive and is based on migrating memory when diff --git a/mm/Kconfig b/mm/Kconfig index b2e21d873d3f..bd283958d675 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -627,20 +627,20 @@ config PAGE_REPORTING those pages to another entity, such as a hypervisor, so that the memory can be freed within the host for other uses. -# -# support for page migration -# -config MIGRATION - bool "Page migration" +config NUMA_MIGRATION + bool "NUMA page migration" default y - depends on (NUMA || MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU - help - Allows the migration of the physical location of pages of processes - while the virtual addresses are not changed. This is useful in - two situations. The first is on NUMA systems to put pages nearer - to the processors accessing. The second is when allocating huge - pages as migration can relocate pages to satisfy a huge page - allocation instead of reclaiming. + depends on NUMA && MMU + select MIGRATION + help + Support the migration of pages to other NUMA nodes, available to + user space through interfaces like migrate_pages(), move_pages(), + and mbind(). Selecting this option also enables support for page + demotion for memory tiering. 
+ +config MIGRATION + bool + depends on MMU config DEVICE_MIGRATION def_bool MIGRATION && ZONE_DEVICE diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 986f809376eb..54851d8a195b 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -69,7 +69,7 @@ bool folio_use_access_time(struct folio *folio) } #endif -#ifdef CONFIG_MIGRATION +#ifdef CONFIG_NUMA_MIGRATION static int top_tier_adistance; /* * node_demotion[] examples: @@ -129,7 +129,7 @@ static int top_tier_adistance; * */ static struct demotion_nodes *node_demotion __read_mostly; -#endif /* CONFIG_MIGRATION */ +#endif /* CONFIG_NUMA_MIGRATION */ static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms); @@ -273,7 +273,7 @@ static struct memory_tier *__node_get_memory_tier(int node) lockdep_is_held(&memory_tier_lock)); } -#ifdef CONFIG_MIGRATION +#ifdef CONFIG_NUMA_MIGRATION bool node_is_toptier(int node) { bool toptier; @@ -519,7 +519,7 @@ static void establish_demotion_targets(void) #else static inline void establish_demotion_targets(void) {} -#endif /* CONFIG_MIGRATION */ +#endif /* CONFIG_NUMA_MIGRATION */ static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype) { @@ -911,7 +911,7 @@ static int __init memory_tier_init(void) if (ret) panic("%s() failed to register memory tier subsystem\n", __func__); -#ifdef CONFIG_MIGRATION +#ifdef CONFIG_NUMA_MIGRATION node_demotion = kzalloc_objs(struct demotion_nodes, nr_node_ids); WARN_ON(!node_demotion); #endif @@ -938,7 +938,7 @@ subsys_initcall(memory_tier_init); bool numa_demotion_enabled = false; -#ifdef CONFIG_MIGRATION +#ifdef CONFIG_NUMA_MIGRATION #ifdef CONFIG_SYSFS static ssize_t demotion_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e5528c35bbb8..fd08771e2057 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1239,7 +1239,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, return err; } -#ifdef CONFIG_MIGRATION +#ifdef 
CONFIG_NUMA_MIGRATION static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { diff --git a/mm/migrate.c b/mm/migrate.c index 3323fc96b1cd..4241eb6eca00 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2222,8 +2222,7 @@ struct folio *alloc_migration_target(struct folio *src, unsigned long private) return __folio_alloc(gfp_mask, order, nid, mtc->nmask); } -#ifdef CONFIG_NUMA - +#ifdef CONFIG_NUMA_MIGRATION static int store_status(int __user *status, int start, int value, int nr) { while (nr-- > 0) { @@ -2622,6 +2621,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, { return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); } +#endif /* CONFIG_NUMA_MIGRATION */ #ifdef CONFIG_NUMA_BALANCING /* @@ -2764,4 +2764,3 @@ int migrate_misplaced_folio(struct folio *folio, int node) return nr_remaining ? -EAGAIN : 0; } #endif /* CONFIG_NUMA_BALANCING */ -#endif /* CONFIG_NUMA */ -- cgit v1.2.3 From a6a8c087dce00eac0c6d03e560b0fa3d529afa5f Mon Sep 17 00:00:00 2001 From: Leno Hou Date: Thu, 19 Mar 2026 00:30:49 +0800 Subject: mm/mglru: fix cgroup OOM during MGLRU state switching When the Multi-Gen LRU (MGLRU) state is toggled dynamically, a race condition exists between the state switching and the memory reclaim path. This can lead to unexpected cgroup OOM kills, even when plenty of reclaimable memory is available. Problem Description ================== The issue arises from a "reclaim vacuum" during the transition. 1. When disabling MGLRU, lru_gen_change_state() sets lrugen->enabled to false before the pages are drained from MGLRU lists back to traditional LRU lists. 2. Concurrent reclaimers in shrink_lruvec() see lrugen->enabled as false and skip the MGLRU path. 3. However, these pages might not have reached the traditional LRU lists yet, or the changes are not yet visible to all CPUs due to a lack of synchronization. 4. 
get_scan_count() subsequently finds traditional LRU lists empty, concludes there is no reclaimable memory, and triggers an OOM kill. A similar race can occur during enablement, where the reclaimer sees the new state but the MGLRU lists haven't been populated via fill_evictable() yet. Solution ======== Introduce a 'switching' state (`lru_switch`) to bridge the transition. When transitioning, the system enters this intermediate state where the reclaimer is forced to attempt both MGLRU and traditional reclaim paths sequentially. This ensures that folios remain visible to at least one reclaim mechanism until the transition is fully materialized across all CPUs. Race & Mitigation ================ A race window exists between checking the 'switching' state and performing the actual list operations. For instance, a reclaimer might observe the switching state as false just before it changes, leading to a suboptimal reclaim path decision. However, this impact is effectively mitigated by the kernel's reclaim retry mechanism (e.g., in do_try_to_free_pages). If a reclaimer pass fails to find eligible folios due to a state transition race, subsequent retries in the loop will observe the updated state and correctly direct the scan to the appropriate LRU lists. This ensures the transient inconsistency does not escalate into a terminal OOM kill. This effectively reduces the race window that previously triggered OOMs under high memory pressure. This fix has been verified on v7.0.0-rc1; dynamic toggling of MGLRU functions correctly without triggering unexpected OOM kills. 
Link: https://lkml.kernel.org/r/20260319-b4-switch-mglru-v2-v5-1-8898491e5f17@gmail.com Signed-off-by: Leno Hou Acked-by: Yafang Shao Reviewed-by: Barry Song Reviewed-by: Axel Rasmussen Cc: Yuanchu Xie Cc: Wei Xu Cc: Jialing Wang Cc: Yu Zhao Cc: Kairui Song Cc: Bingfang Guo Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 11 +++++++++++ mm/rmap.c | 7 ++++++- mm/vmscan.c | 33 ++++++++++++++++++++++++--------- 3 files changed, 41 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index fa2d6ba811b5..2aedcff6a2c1 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -102,6 +102,12 @@ static __always_inline enum lru_list folio_lru_list(const struct folio *folio) #ifdef CONFIG_LRU_GEN +static inline bool lru_gen_switching(void) +{ + DECLARE_STATIC_KEY_FALSE(lru_switch); + + return static_branch_unlikely(&lru_switch); +} #ifdef CONFIG_LRU_GEN_ENABLED static inline bool lru_gen_enabled(void) { @@ -316,6 +322,11 @@ static inline bool lru_gen_enabled(void) return false; } +static inline bool lru_gen_switching(void) +{ + return false; +} + static inline bool lru_gen_in_fault(void) { return false; diff --git a/mm/rmap.c b/mm/rmap.c index abe4712a220c..78b7fb5f367c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -973,7 +973,12 @@ static bool folio_referenced_one(struct folio *folio, nr = folio_pte_batch(folio, pvmw.pte, pteval, max_nr); } - if (lru_gen_enabled() && pvmw.pte) { + /* + * When LRU is switching, we don’t know where the surrounding folios + * are. —they could be on active/inactive lists or on MGLRU. So the + * simplest approach is to disable this look-around optimization. 
+ */ + if (lru_gen_enabled() && !lru_gen_switching() && pvmw.pte) { if (lru_gen_look_around(&pvmw, nr)) referenced++; } else if (pvmw.pte) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 641a6063f375..42f834c508bc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -905,7 +905,7 @@ static enum folio_references folio_check_references(struct folio *folio, if (referenced_ptes == -1) return FOLIOREF_KEEP; - if (lru_gen_enabled()) { + if (lru_gen_enabled() && !lru_gen_switching()) { if (!referenced_ptes) return FOLIOREF_RECLAIM; @@ -2308,7 +2308,7 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) unsigned long file; struct lruvec *target_lruvec; - if (lru_gen_enabled()) + if (lru_gen_enabled() && !lru_gen_switching()) return; target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); @@ -2647,6 +2647,7 @@ static bool can_age_anon_pages(struct lruvec *lruvec, #ifdef CONFIG_LRU_GEN +DEFINE_STATIC_KEY_FALSE(lru_switch); #ifdef CONFIG_LRU_GEN_ENABLED DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS); #define get_cap(cap) static_branch_likely(&lru_gen_caps[cap]) @@ -5181,6 +5182,8 @@ static void lru_gen_change_state(bool enabled) if (enabled == lru_gen_enabled()) goto unlock; + static_branch_enable_cpuslocked(&lru_switch); + if (enabled) static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); else @@ -5211,6 +5214,9 @@ static void lru_gen_change_state(bool enabled) cond_resched(); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + + static_branch_disable_cpuslocked(&lru_switch); + unlock: mutex_unlock(&state_mutex); put_online_mems(); @@ -5783,9 +5789,12 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) bool proportional_reclaim; struct blk_plug plug; - if (lru_gen_enabled() && !root_reclaim(sc)) { + if ((lru_gen_enabled() || lru_gen_switching()) && !root_reclaim(sc)) { lru_gen_shrink_lruvec(lruvec, sc); - return; + + if (!lru_gen_switching()) + return; + } get_scan_count(lruvec, sc, nr); @@ 
-6045,10 +6054,13 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) struct lruvec *target_lruvec; bool reclaimable = false; - if (lru_gen_enabled() && root_reclaim(sc)) { + if ((lru_gen_enabled() || lru_gen_switching()) && root_reclaim(sc)) { memset(&sc->nr, 0, sizeof(sc->nr)); lru_gen_shrink_node(pgdat, sc); - return; + + if (!lru_gen_switching()) + return; + } target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); @@ -6318,7 +6330,7 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) struct lruvec *target_lruvec; unsigned long refaults; - if (lru_gen_enabled()) + if (lru_gen_enabled() && !lru_gen_switching()) return; target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); @@ -6708,9 +6720,12 @@ static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) struct mem_cgroup *memcg; struct lruvec *lruvec; - if (lru_gen_enabled()) { + if (lru_gen_enabled() || lru_gen_switching()) { lru_gen_age_node(pgdat, sc); - return; + + if (!lru_gen_switching()) + return; + } lruvec = mem_cgroup_lruvec(NULL, pgdat); -- cgit v1.2.3 From a62ca3f40feaaaf0dfc4db1f2edeca5a70f4123d Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sat, 21 Mar 2026 14:42:49 +0800 Subject: mm: change to return bool for ptep_test_and_clear_young() Patch series "change young flag check functions to return bool", v2. This is a cleanup patchset to change all young flag check functions to return bool, as discussed with David in the previous thread[1]. Since callers only care about whether the young flag was set, returning bool makes the intention clearer. No functional changes intended. This patch (of 6): Callers use ptep_test_and_clear_young() to clear the young flag and check whether it was set. Change the return type to bool to make the intention clearer. 
Link: https://lkml.kernel.org/r/cover.1774075004.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/57e70efa9703d43959aa645246ea3cbdba14fa17.1774075004.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 16 +++++++--------- arch/arm64/mm/contpte.c | 7 +++---- arch/microblaze/include/asm/pgtable.h | 2 +- arch/parisc/include/asm/pgtable.h | 7 ++++--- arch/powerpc/include/asm/book3s/32/pgtable.h | 4 ++-- arch/powerpc/include/asm/book3s/64/pgtable.h | 6 +++--- arch/powerpc/include/asm/nohash/pgtable.h | 4 ++-- arch/riscv/include/asm/pgtable.h | 4 ++-- arch/riscv/mm/pgtable.c | 7 +++---- arch/s390/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable.h | 4 ++-- arch/x86/mm/pgtable.c | 6 +++--- arch/xtensa/include/asm/pgtable.h | 9 ++++----- include/linux/pgtable.h | 16 ++++++++-------- 14 files changed, 46 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index ab451d20e4c5..79596cc05dcb 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1282,9 +1282,8 @@ static inline void __pte_clear(struct mm_struct *mm, __set_pte(ptep, __pte(0)); } -static inline int __ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pte_t *ptep) +static inline bool __ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { pte_t old_pte, pte; @@ -1646,7 +1645,7 @@ extern void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full); -int contpte_test_and_clear_young_ptes(struct vm_area_struct *vma, 
+bool contpte_test_and_clear_young_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr); int contpte_clear_flush_young_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr); @@ -1813,9 +1812,8 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, } #define test_and_clear_young_ptes test_and_clear_young_ptes -static inline int test_and_clear_young_ptes(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - unsigned int nr) +static inline bool test_and_clear_young_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) { if (likely(nr == 1 && !pte_cont(__ptep_get(ptep)))) return __ptep_test_and_clear_young(vma, addr, ptep); @@ -1824,8 +1822,8 @@ static inline int test_and_clear_young_ptes(struct vm_area_struct *vma, } #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { return test_and_clear_young_ptes(vma, addr, ptep, 1); } diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index 1519d090d5ea..a31cae78f712 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -508,9 +508,8 @@ pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes); -int contpte_test_and_clear_young_ptes(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - unsigned int nr) +bool contpte_test_and_clear_young_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) { /* * ptep_clear_flush_young() technically requires us to clear the access @@ -525,7 +524,7 @@ int contpte_test_and_clear_young_ptes(struct vm_area_struct *vma, */ unsigned long end = addr + nr * PAGE_SIZE; - int young = 0; + bool young = false; ptep = contpte_align_addr_ptep(&addr, &end, ptep, nr); for 
(; addr != end; ptep++, addr += PAGE_SIZE) diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index ea72291de553..7678c040a2fd 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -318,7 +318,7 @@ static inline void set_pte(pte_t *ptep, pte_t pte) #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG struct vm_area_struct; -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, +static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { return (pte_update(ptep, _PAGE_ACCESSED, 0) & _PAGE_ACCESSED) != 0; diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index f6fb99cb94d9..7097c785f690 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -438,16 +438,17 @@ static inline pte_t ptep_get(pte_t *ptep) } #define ptep_get ptep_get -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) +static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { pte_t pte; pte = ptep_get(ptep); if (!pte_young(pte)) { - return 0; + return false; } set_pte_at(vma->vm_mm, addr, ptep, pte_mkold(pte)); - return 1; + return true; } int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 001e28f9eabc..4a271318dee8 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -295,8 +295,8 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p * for our hash-based implementation, we fix that up here. 
*/ #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int __ptep_test_and_clear_young(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) +static inline bool __ptep_test_and_clear_young(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) { unsigned long old; old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0); diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 1a91762b455d..c049a2e26e25 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -349,13 +349,13 @@ static inline unsigned long pte_update(struct mm_struct *mm, unsigned long addr, * For radix: H_PAGE_HASHPTE should be zero. Hence we can use the same * function for both hash and radix. */ -static inline int __ptep_test_and_clear_young(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) +static inline bool __ptep_test_and_clear_young(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) { unsigned long old; if ((pte_raw(*ptep) & cpu_to_be64(_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) - return 0; + return false; old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0); return (old & _PAGE_ACCESSED) != 0; } diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index e6da5eaccff6..3a6f20a1c800 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -101,8 +101,8 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p } #endif -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { unsigned long old; diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index ab4ce1cc9d9c..643d12481b02 100644 --- a/arch/riscv/include/asm/pgtable.h +++ 
b/arch/riscv/include/asm/pgtable.h @@ -659,8 +659,8 @@ static inline void pte_clear(struct mm_struct *mm, extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG /* defined in mm/pgtable.c */ -extern int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, - pte_t *ptep); +bool ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep); #define __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c index b1ed2f14dc3a..9c4427d0b187 100644 --- a/arch/riscv/mm/pgtable.c +++ b/arch/riscv/mm/pgtable.c @@ -29,12 +29,11 @@ int ptep_set_access_flags(struct vm_area_struct *vma, return true; } -int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pte_t *ptep) +bool ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { if (!pte_young(ptep_get(ptep))) - return 0; + return false; return test_and_clear_bit(_PAGE_ACCESSED_OFFSET, &pte_val(*ptep)); } EXPORT_SYMBOL_GPL(ptep_test_and_clear_young); diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 1c3c3be93be9..ef4748ee3a2b 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1164,8 +1164,8 @@ pte_t ptep_xchg_direct(struct mm_struct *, unsigned long, pte_t *, pte_t); pte_t ptep_xchg_lazy(struct mm_struct *, unsigned long, pte_t *, pte_t); #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 54289f4587a4..1d86fb33239f 100644 
--- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1232,8 +1232,8 @@ extern int ptep_set_access_flags(struct vm_area_struct *vma, pte_t entry, int dirty); #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -extern int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep); +bool ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH extern int ptep_clear_flush_young(struct vm_area_struct *vma, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 2e5ecfdce73c..5ee38dda9124 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -443,10 +443,10 @@ int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, } #endif -int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +bool ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { - int ret = 0; + bool ret = false; if (pte_young(*ptep)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 61f07d981a94..f00a879dc298 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -304,15 +304,14 @@ set_pmd(pmd_t *pmdp, pmd_t pmdval) struct vm_area_struct; -static inline int -ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep) +static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; if (!pte_young(pte)) - return 0; + return false; update_pte(ptep, pte_mkold(pte)); - return 1; + return true; } static inline pte_t diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 17d961c612fc..8e75dc9f7932 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -491,17 +491,17 @@ static inline pgd_t pgdp_get(pgd_t *pgdp) #endif #ifndef 
__HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pte_t *ptep) +static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { pte_t pte = ptep_get(ptep); - int r = 1; + bool young = true; + if (!pte_young(pte)) - r = 0; + young = false; else set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte)); - return r; + return young; } #endif @@ -1123,10 +1123,10 @@ static inline int clear_flush_young_ptes(struct vm_area_struct *vma, * * Returns: whether any PTE was young. */ -static inline int test_and_clear_young_ptes(struct vm_area_struct *vma, +static inline bool test_and_clear_young_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { - int young = 0; + bool young = false; for (;;) { young |= ptep_test_and_clear_young(vma, addr, ptep); -- cgit v1.2.3 From 06c4dfa3ced61635895d0e258da8dc63da539f42 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sat, 21 Mar 2026 14:42:50 +0800 Subject: mm: change to return bool for ptep_clear_flush_young()/clear_flush_young_ptes() The ptep_clear_flush_young() and clear_flush_young_ptes() are used to clear the young flag and flush the TLB, returning whether the young flag was set. Change the return type to bool to make the intention clearer. 
Link: https://lkml.kernel.org/r/24af5144b96103631594501f77d4525f2475c1be.1774075004.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 17 ++++++++--------- arch/arm64/mm/contpte.c | 7 +++---- arch/parisc/include/asm/pgtable.h | 2 +- arch/parisc/kernel/cache.c | 8 ++++---- arch/powerpc/include/asm/nohash/64/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 4 ++-- arch/s390/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable.h | 4 ++-- arch/x86/mm/pgtable.c | 4 ++-- include/linux/pgtable.h | 8 ++++---- mm/pgtable-generic.c | 7 ++++--- 11 files changed, 33 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 79596cc05dcb..1009f719b157 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1298,10 +1298,10 @@ static inline bool __ptep_test_and_clear_young(struct vm_area_struct *vma, return pte_young(pte); } -static inline int __ptep_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +static inline bool __ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { - int young = __ptep_test_and_clear_young(vma, address, ptep); + bool young = __ptep_test_and_clear_young(vma, address, ptep); if (young) { /* @@ -1647,7 +1647,7 @@ extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, unsigned int nr, int full); bool contpte_test_and_clear_young_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr); -int contpte_clear_flush_young_ptes(struct vm_area_struct *vma, +bool contpte_clear_flush_young_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr); extern void 
contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr); @@ -1829,8 +1829,8 @@ static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, } #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -static inline int ptep_clear_flush_young(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline bool ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { pte_t orig_pte = __ptep_get(ptep); @@ -1841,9 +1841,8 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, } #define clear_flush_young_ptes clear_flush_young_ptes -static inline int clear_flush_young_ptes(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - unsigned int nr) +static inline bool clear_flush_young_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) { if (likely(nr == 1 && !pte_cont(__ptep_get(ptep)))) return __ptep_clear_flush_young(vma, addr, ptep); diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index a31cae78f712..2dc1b8ad71e8 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -534,11 +534,10 @@ bool contpte_test_and_clear_young_ptes(struct vm_area_struct *vma, } EXPORT_SYMBOL_GPL(contpte_test_and_clear_young_ptes); -int contpte_clear_flush_young_ptes(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - unsigned int nr) +bool contpte_clear_flush_young_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) { - int young; + bool young; young = contpte_test_and_clear_young_ptes(vma, addr, ptep, nr); diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 7097c785f690..467b8547ac8b 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -451,7 +451,7 @@ static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, return true; } -int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long 
addr, pte_t *ptep); +bool ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); struct mm_struct; diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index b189265785dc..0170b69a21d3 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -781,18 +781,18 @@ void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned lon __flush_cache_page(vma, vmaddr, PFN_PHYS(page_to_pfn(page))); } -int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep) +bool ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { pte_t pte = ptep_get(ptep); if (!pte_young(pte)) - return 0; + return false; set_pte(ptep, pte_mkold(pte)); #if CONFIG_FLUSH_PAGE_ACCESSED __flush_cache_page(vma, addr, PFN_PHYS(pte_pfn(pte))); #endif - return 1; + return true; } /* diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 2deb955b7bc8..661eb3820d12 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -155,7 +155,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH #define ptep_clear_flush_young(__vma, __address, __ptep) \ ({ \ - int __young = ptep_test_and_clear_young(__vma, __address, __ptep);\ + bool __young = ptep_test_and_clear_young(__vma, __address, __ptep);\ __young; \ }) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 643d12481b02..b9dacfc280b1 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -695,8 +695,8 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, } #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -static inline int ptep_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) 
+static inline bool ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { /* * This comment is borrowed from x86, but applies equally to RISC-V: diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index ef4748ee3a2b..ac74b5076d8f 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1174,8 +1174,8 @@ static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, } #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -static inline int ptep_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +static inline bool ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { return ptep_test_and_clear_young(vma, address, ptep); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1d86fb33239f..3993657e0a35 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1236,8 +1236,8 @@ bool ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -extern int ptep_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep); +bool ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep); #define __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5ee38dda9124..1348384a3bb9 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -483,8 +483,8 @@ int pudp_test_and_clear_young(struct vm_area_struct *vma, } #endif -int ptep_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +bool ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { /* * On x86 CPUs, clearing the accessed bit without a TLB flush diff --git a/include/linux/pgtable.h 
b/include/linux/pgtable.h index 8e75dc9f7932..99450a3b0705 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -531,8 +531,8 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -int ptep_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep); +bool ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH @@ -1086,10 +1086,10 @@ static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ -static inline int clear_flush_young_ptes(struct vm_area_struct *vma, +static inline bool clear_flush_young_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { - int young = 0; + bool young = false; for (;;) { young |= ptep_clear_flush_young(vma, addr, ptep); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index af7966169d69..db0ee918b08a 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -81,10 +81,11 @@ int ptep_set_access_flags(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -int ptep_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +bool ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { - int young; + bool young; + young = ptep_test_and_clear_young(vma, address, ptep); if (young) flush_tlb_page(vma, address); -- cgit v1.2.3 From 42e26354c4ef28772398b1d71b7477834037305c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sat, 21 Mar 2026 14:42:51 +0800 Subject: mm: change to return bool for pmdp_test_and_clear_young() Callers use pmdp_test_and_clear_young() to clear the young flag and check whether it was set for this PMD entry. 
Change the return type to bool to make the intention clearer. Link: https://lkml.kernel.org/r/f1d31307a13365d3d0fed5809727dcc2dd59631b.1774075004.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 5 ++--- arch/powerpc/include/asm/book3s/64/pgtable.h | 10 +++++----- arch/powerpc/mm/book3s64/pgtable.c | 4 ++-- arch/riscv/include/asm/pgtable.h | 4 ++-- arch/s390/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable.h | 4 ++-- arch/x86/mm/pgtable.c | 6 +++--- include/linux/pgtable.h | 19 +++++++++---------- 8 files changed, 27 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 1009f719b157..52bafe79c10a 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1320,9 +1320,8 @@ static inline bool __ptep_clear_flush_young(struct vm_area_struct *vma, #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG -static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmdp) +static inline bool pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { /* Operation applies to PMD table entry only if FEAT_HAFT is enabled */ VM_WARN_ON(pmd_table(READ_ONCE(*pmdp)) && !system_supports_haft()); diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index c049a2e26e25..8b354e81ab22 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1161,13 +1161,13 @@ pud_hugepage_update(struct mm_struct *mm, unsigned long addr, pud_t *pudp, * For radix 
we should always find H_PAGE_HASHPTE zero. Hence * the below will work for radix too */ -static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) +static inline bool __pmdp_test_and_clear_young(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) { unsigned long old; if ((pmd_raw(*pmdp) & cpu_to_be64(_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) - return 0; + return false; old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0); return ((old & _PAGE_ACCESSED) != 0); } @@ -1300,8 +1300,8 @@ extern int pudp_set_access_flags(struct vm_area_struct *vma, pud_t entry, int dirty); #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG -extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); +bool pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); #define __HAVE_ARCH_PUDP_TEST_AND_CLEAR_YOUNG extern int pudp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pud_t *pudp); diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 4b09c04654a8..c584321e3d41 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -98,8 +98,8 @@ int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, } -int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) +bool pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index b9dacfc280b1..67e7746e3fbe 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -1015,8 +1015,8 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma, } #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG -static inline int pmdp_test_and_clear_young(struct vm_area_struct 
*vma, - unsigned long address, pmd_t *pmdp) +static inline bool pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { return ptep_test_and_clear_young(vma, address, (pte_t *)pmdp); } diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index ac74b5076d8f..87a5082da28e 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1683,8 +1683,8 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma, } #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG -static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) +static inline bool pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) { pmd_t pmd = *pmdp; diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3993657e0a35..ba867bac6096 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1295,8 +1295,8 @@ extern int pudp_set_access_flags(struct vm_area_struct *vma, pud_t entry, int dirty); #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG -extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp); +bool pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); extern int pudp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 1348384a3bb9..b09e8c5dadf9 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -456,10 +456,10 @@ bool ptep_test_and_clear_young(struct vm_area_struct *vma, } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) -int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) +bool pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) { - int ret = 0; + bool ret = false; if 
(pmd_young(*pmdp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 99450a3b0705..6db900a5d38b 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -507,25 +507,24 @@ static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) -static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmdp) +static inline bool pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; - int r = 1; + bool young = true; + if (!pmd_young(pmd)) - r = 0; + young = false; else set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd)); - return r; + return young; } #else -static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmdp) +static inline bool pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { BUILD_BUG(); - return 0; + return false; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ #endif -- cgit v1.2.3 From 2d46a397472191a10b0df294d64da542bfd1de57 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sat, 21 Mar 2026 14:42:52 +0800 Subject: mm: change to return bool for pmdp_clear_flush_young() The pmdp_clear_flush_young() is used to clear the young flag and flush the TLB, returning whether the young flag was set for this PMD entry. Change the return type to bool to make the intention clearer. 
Link: https://lkml.kernel.org/r/a668b9a974c0d675e7a41f6973bcbe3336e8b373.1774075004.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Ritesh Harjani (IBM) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- arch/s390/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable.h | 4 ++-- arch/x86/mm/pgtable.c | 6 +++--- include/linux/pgtable.h | 10 +++++----- mm/pgtable-generic.c | 7 ++++--- 5 files changed, 16 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 87a5082da28e..40a6fb19dd1d 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1693,8 +1693,8 @@ static inline bool pmdp_test_and_clear_young(struct vm_area_struct *vma, } #define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH -static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) +static inline bool pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) { VM_BUG_ON(addr & ~HPAGE_MASK); return pmdp_test_and_clear_young(vma, addr, pmdp); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index ba867bac6096..6c8f2b17d3f9 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1301,8 +1301,8 @@ extern int pudp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp); #define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH -extern int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); +bool pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index b09e8c5dadf9..fc1c996c5b2d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ 
-503,10 +503,10 @@ bool ptep_clear_flush_young(struct vm_area_struct *vma, } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) +bool pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { - int young; + bool young; VM_BUG_ON(address & ~HPAGE_PMD_MASK); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 6db900a5d38b..cdd68ed3ae1a 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -536,18 +536,18 @@ bool ptep_clear_flush_young(struct vm_area_struct *vma, #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH #ifdef CONFIG_TRANSPARENT_HUGEPAGE -extern int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); +bool pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); #else /* * Despite relevant to THP only, this API is called from generic rmap code * under PageTransHuge(), hence needs a dummy implementation for !THP */ -static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) +static inline bool pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { BUILD_BUG(); - return 0; + return false; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index db0ee918b08a..b91b1a98029c 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -124,10 +124,11 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH -int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) +bool pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { - int young; + bool young; + VM_BUG_ON(address & ~HPAGE_PMD_MASK); young = pmdp_test_and_clear_young(vma, address, pmdp); if (young) -- cgit v1.2.3 From 
1fc7dc675e26c43f3219d70a09b9f0c4aa43a13a Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sat, 21 Mar 2026 14:42:54 +0800 Subject: mm: change to return bool for the MMU notifier's young flag check The MMU notifier young flag check related functions only return whether the young flag was set. Change the return type to bool to make the intention clearer. Link: https://lkml.kernel.org/r/a9ad3fe938002d87358e7bfca264f753ab602561.1774075004.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Ritesh Harjani (IBM) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 76 ++++++++++++++++++++------------------------ mm/internal.h | 16 +++++----- mm/mmu_notifier.c | 23 +++++++------- virt/kvm/kvm_main.c | 31 +++++++----------- 4 files changed, 66 insertions(+), 80 deletions(-) (limited to 'include') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 3705d350c863..17f2cdc77dd5 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -97,20 +97,20 @@ struct mmu_notifier_ops { * Start-end is necessary in case the secondary MMU is mapping the page * at a smaller granularity than the primary MMU. */ - int (*clear_flush_young)(struct mmu_notifier *subscription, - struct mm_struct *mm, - unsigned long start, - unsigned long end); + bool (*clear_flush_young)(struct mmu_notifier *subscription, + struct mm_struct *mm, + unsigned long start, + unsigned long end); /* * clear_young is a lightweight version of clear_flush_young. Like the * latter, it is supposed to test-and-clear the young/accessed bitflag * in the secondary pte, but it may omit flushing the secondary tlb. 
*/ - int (*clear_young)(struct mmu_notifier *subscription, - struct mm_struct *mm, - unsigned long start, - unsigned long end); + bool (*clear_young)(struct mmu_notifier *subscription, + struct mm_struct *mm, + unsigned long start, + unsigned long end); /* * test_young is called to check the young/accessed bitflag in @@ -118,9 +118,9 @@ struct mmu_notifier_ops { * frequently used without actually clearing the flag or tearing * down the secondary mapping on the page. */ - int (*test_young)(struct mmu_notifier *subscription, - struct mm_struct *mm, - unsigned long address); + bool (*test_young)(struct mmu_notifier *subscription, + struct mm_struct *mm, + unsigned long address); /* * invalidate_range_start() and invalidate_range_end() must be @@ -376,14 +376,12 @@ mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub, extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); -extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, - unsigned long start, - unsigned long end); -extern int __mmu_notifier_clear_young(struct mm_struct *mm, - unsigned long start, - unsigned long end); -extern int __mmu_notifier_test_young(struct mm_struct *mm, - unsigned long address); +bool __mmu_notifier_clear_flush_young(struct mm_struct *mm, + unsigned long start, unsigned long end); +bool __mmu_notifier_clear_young(struct mm_struct *mm, + unsigned long start, unsigned long end); +bool __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address); extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r); extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r); extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, @@ -403,30 +401,28 @@ static inline void mmu_notifier_release(struct mm_struct *mm) __mmu_notifier_release(mm); } -static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, - unsigned long 
start, - unsigned long end) +static inline bool mmu_notifier_clear_flush_young(struct mm_struct *mm, + unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) return __mmu_notifier_clear_flush_young(mm, start, end); - return 0; + return false; } -static inline int mmu_notifier_clear_young(struct mm_struct *mm, - unsigned long start, - unsigned long end) +static inline bool mmu_notifier_clear_young(struct mm_struct *mm, + unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) return __mmu_notifier_clear_young(mm, start, end); - return 0; + return false; } -static inline int mmu_notifier_test_young(struct mm_struct *mm, - unsigned long address) +static inline bool mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) { if (mm_has_notifiers(mm)) return __mmu_notifier_test_young(mm, address); - return 0; + return false; } static inline void @@ -552,24 +548,22 @@ static inline void mmu_notifier_release(struct mm_struct *mm) { } -static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, - unsigned long start, - unsigned long end) +static inline bool mmu_notifier_clear_flush_young(struct mm_struct *mm, + unsigned long start, unsigned long end) { - return 0; + return false; } -static inline int mmu_notifier_clear_young(struct mm_struct *mm, - unsigned long start, - unsigned long end) +static inline bool mmu_notifier_clear_young(struct mm_struct *mm, + unsigned long start, unsigned long end) { - return 0; + return false; } -static inline int mmu_notifier_test_young(struct mm_struct *mm, - unsigned long address) +static inline bool mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) { - return 0; + return false; } static inline void diff --git a/mm/internal.h b/mm/internal.h index 9ae0ee6c34f9..3d3fa35e5fd1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1860,10 +1860,10 @@ static inline int io_remap_pfn_range_complete(struct vm_area_struct *vma, } #ifdef CONFIG_MMU_NOTIFIER -static inline int 
clear_flush_young_ptes_notify(struct vm_area_struct *vma, +static inline bool clear_flush_young_ptes_notify(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { - int young; + bool young; young = clear_flush_young_ptes(vma, addr, ptep, nr); young |= mmu_notifier_clear_flush_young(vma->vm_mm, addr, @@ -1871,30 +1871,30 @@ static inline int clear_flush_young_ptes_notify(struct vm_area_struct *vma, return young; } -static inline int pmdp_clear_flush_young_notify(struct vm_area_struct *vma, +static inline bool pmdp_clear_flush_young_notify(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - int young; + bool young; young = pmdp_clear_flush_young(vma, addr, pmdp); young |= mmu_notifier_clear_flush_young(vma->vm_mm, addr, addr + PMD_SIZE); return young; } -static inline int test_and_clear_young_ptes_notify(struct vm_area_struct *vma, +static inline bool test_and_clear_young_ptes_notify(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { - int young; + bool young; young = test_and_clear_young_ptes(vma, addr, ptep, nr); young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + nr * PAGE_SIZE); return young; } -static inline int pmdp_test_and_clear_young_notify(struct vm_area_struct *vma, +static inline bool pmdp_test_and_clear_young_notify(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - int young; + bool young; young = pmdp_test_and_clear_young(vma, addr, pmdp); young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PMD_SIZE); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 2502474b83b6..dc6f78d559f7 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -364,12 +364,12 @@ void __mmu_notifier_release(struct mm_struct *mm) * unmap the address and return 1 or 0 depending if the mapping previously * existed or not. 
*/ -int __mmu_notifier_clear_flush_young(struct mm_struct *mm, - unsigned long start, - unsigned long end) +bool __mmu_notifier_clear_flush_young(struct mm_struct *mm, + unsigned long start, unsigned long end) { struct mmu_notifier *subscription; - int young = 0, id; + bool young = false; + int id; id = srcu_read_lock(&srcu); hlist_for_each_entry_srcu(subscription, @@ -384,12 +384,12 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, return young; } -int __mmu_notifier_clear_young(struct mm_struct *mm, - unsigned long start, - unsigned long end) +bool __mmu_notifier_clear_young(struct mm_struct *mm, + unsigned long start, unsigned long end) { struct mmu_notifier *subscription; - int young = 0, id; + bool young = false; + int id; id = srcu_read_lock(&srcu); hlist_for_each_entry_srcu(subscription, @@ -404,11 +404,12 @@ int __mmu_notifier_clear_young(struct mm_struct *mm, return young; } -int __mmu_notifier_test_young(struct mm_struct *mm, - unsigned long address) +bool __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) { struct mmu_notifier *subscription; - int young = 0, id; + bool young = false; + int id; id = srcu_read_lock(&srcu); hlist_for_each_entry_srcu(subscription, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d0ab29672c71..82433f46c438 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -646,11 +646,9 @@ mmu_unlock: return r; } -static __always_inline int kvm_age_hva_range(struct mmu_notifier *mn, - unsigned long start, - unsigned long end, - gfn_handler_t handler, - bool flush_on_ret) +static __always_inline bool kvm_age_hva_range(struct mmu_notifier *mn, + unsigned long start, unsigned long end, gfn_handler_t handler, + bool flush_on_ret) { struct kvm *kvm = mmu_notifier_to_kvm(mn); const struct kvm_mmu_notifier_range range = { @@ -666,10 +664,8 @@ static __always_inline int kvm_age_hva_range(struct mmu_notifier *mn, return kvm_handle_hva_range(kvm, &range).ret; } -static __always_inline int 
kvm_age_hva_range_no_flush(struct mmu_notifier *mn, - unsigned long start, - unsigned long end, - gfn_handler_t handler) +static __always_inline bool kvm_age_hva_range_no_flush(struct mmu_notifier *mn, + unsigned long start, unsigned long end, gfn_handler_t handler) { return kvm_age_hva_range(mn, start, end, handler, false); } @@ -829,10 +825,8 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait); } -static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, - unsigned long end) +static bool kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, + struct mm_struct *mm, unsigned long start, unsigned long end) { trace_kvm_age_hva(start, end); @@ -840,10 +834,8 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, !IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG)); } -static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, - unsigned long end) +static bool kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, + struct mm_struct *mm, unsigned long start, unsigned long end) { trace_kvm_age_hva(start, end); @@ -863,9 +855,8 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, return kvm_age_hva_range_no_flush(mn, start, end, kvm_age_gfn); } -static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address) +static bool kvm_mmu_notifier_test_young(struct mmu_notifier *mn, + struct mm_struct *mm, unsigned long address) { trace_kvm_test_age_hva(address); -- cgit v1.2.3 From 54fdcbfe1cbd1d8f06d0c57c8cc43ddcc1cd421c Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Mon, 23 Mar 2026 17:03:04 +0800 Subject: mm: remove unused page_is_file_lru() function The page_is_file_lru() wrapper function is no longer used. The kernel has moved to folio-based APIs, and all callers should use folio_is_file_lru() instead. 
Remove the obsolete page-based wrapper function. Link: https://lkml.kernel.org/r/20260323090305.798057-1-ye.liu@linux.dev Signed-off-by: Ye Liu Acked-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 2aedcff6a2c1..7fc2ced00f8f 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -30,11 +30,6 @@ static inline int folio_is_file_lru(const struct folio *folio) return !folio_test_swapbacked(folio); } -static inline int page_is_file_lru(struct page *page) -{ - return folio_is_file_lru(page_folio(page)); -} - static __always_inline void __update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) -- cgit v1.2.3 From 4ff07459db888054f68575646d7fe04f31f1e56d Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Thu, 19 Mar 2026 09:25:41 +0800 Subject: mm/huge_memory: fix folio isn't locked in softleaf_to_folio() On an arm64 server, we found that a folio obtained from a migration entry isn't locked in softleaf_to_folio(). This issue triggers when mTHP splitting and zap_nonpresent_ptes() race, and the root cause is the lack of a memory barrier in softleaf_to_folio(). The race is as follows: CPU0 CPU1 deferred_split_scan() zap_nonpresent_ptes() lock folio split_folio() unmap_folio() change ptes to migration entries __split_folio_to_order() softleaf_to_folio() set flags(including PG_locked) for tail pages folio = pfn_folio(softleaf_to_pfn(entry)) smp_wmb() VM_WARN_ON_ONCE(!folio_test_locked(folio)) prep_compound_page() for tail pages In __split_folio_to_order(), smp_wmb() guarantees page flags of tail pages are visible before the tail page becomes non-compound. smp_wmb() should be paired with smp_rmb() in softleaf_to_folio(), which is missing.
As a result, if zap_nonpresent_ptes() accesses a migration entry that stores a tail pfn, softleaf_to_folio() may see the updated compound_head of the tail page before page->flags. This issue will trigger VM_WARN_ON_ONCE() in pfn_swap_entry_folio() because of the race between folio split and zap_nonpresent_ptes() leading to a folio incorrectly undergoing modification without a folio lock being held. This was a BUG_ON() before commit 93976a20345b ("mm: eliminate further swapops predicates"), which was merged in v6.19-rc1. To fix it, add the missing smp_rmb() if the softleaf entry is a migration entry in softleaf_to_folio() and softleaf_to_page(). [tujinjiang@huawei.com: update function name and comments] Link: https://lkml.kernel.org/r/20260321075214.3305564-1-tujinjiang@huawei.com Link: https://lkml.kernel.org/r/20260319012541.4158561-1-tujinjiang@huawei.com Fixes: e9b61f19858a ("thp: reintroduce split_huge_page()") Signed-off-by: Jinjiang Tu Acked-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Barry Song Cc: Kefeng Wang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nanyong Sun Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/leafops.h | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/leafops.h b/include/linux/leafops.h index a9ff94b744f2..05673d3529e7 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -363,6 +363,23 @@ static inline unsigned long softleaf_to_pfn(softleaf_t entry) return swp_offset(entry) & SWP_PFN_MASK; } +static inline void softleaf_migration_sync(softleaf_t entry, + struct folio *folio) +{ + /* + * Ensure we do not race with split, which might alter tail pages into new + * folios and thus result in observing an unlocked folio. + * This matches the write barrier in __split_folio_to_order().
+ */ + smp_rmb(); + + /* + * Any use of migration entries may only occur while the + * corresponding page is locked + */ + VM_WARN_ON_ONCE(!folio_test_locked(folio)); +} + /** * softleaf_to_page() - Obtains struct page for PFN encoded within leaf entry. * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. @@ -374,11 +391,8 @@ static inline struct page *softleaf_to_page(softleaf_t entry) struct page *page = pfn_to_page(softleaf_to_pfn(entry)); VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); - /* - * Any use of migration entries may only occur while the - * corresponding page is locked - */ - VM_WARN_ON_ONCE(softleaf_is_migration(entry) && !PageLocked(page)); + if (softleaf_is_migration(entry)) + softleaf_migration_sync(entry, page_folio(page)); return page; } @@ -394,12 +408,8 @@ static inline struct folio *softleaf_to_folio(softleaf_t entry) struct folio *folio = pfn_folio(softleaf_to_pfn(entry)); VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); - /* - * Any use of migration entries may only occur while the - * corresponding folio is locked. - */ - VM_WARN_ON_ONCE(softleaf_is_migration(entry) && - !folio_test_locked(folio)); + if (softleaf_is_migration(entry)) + softleaf_migration_sync(entry, folio); return folio; } -- cgit v1.2.3 From 6bc0987d0b508b3768808efafa1e90041713526b Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:18 +0000 Subject: mm/vma: add vma_flags_empty(), vma_flags_and(), vma_flags_diff_pair() Patch series "mm/vma: convert vm_flags_t to vma_flags_t in vma code", v4. This series converts a lot of the existing use of the legacy vm_flags_t data type to the new vma_flags_t type which replaces it. In order to do so it adds a number of additional helpers: * vma_flags_empty() - Determines whether a vma_flags_t value has no bits set. * vma_flags_and() - Performs a bitwise AND between two vma_flags_t values. 
* vma_flags_diff_pair() - Determines which flags are not shared between a pair of VMA flags (typically non-constant values) * append_vma_flags() - Similar to mk_vma_flags(), but allows a vma_flags_t value to be specified (typically a constant value) which will be copied and appended to, to create a new vma_flags_t value, with additional flags specified to append to it. * vma_flags_same() - Determines if a vma_flags_t value is exactly equal to a set of VMA flags. * vma_flags_same_mask() - Determines if a vma_flags_t value is exactly equal to another vma_flags_t value (typically constant). * vma_flags_same_pair() - Determines if a pair of vma_flags_t values are exactly equal to one another (typically both non-constant). * vma_flags_to_legacy() - Converts a vma_flags_t value to a vm_flags_t value, used to enable more iterative introduction of the use of vma_flags_t. * legacy_to_vma_flags() - Converts a vm_flags_t value to a vma_flags_t value, for the same purpose. * vma_flags_test_single_mask() - Tests whether a vma_flags_t value contains the single flag specified in an input vma_flags_t flag mask, or if that flag mask is empty, is defined to return false. Useful for config-predicated VMA flag mask defines. * vma_test() - Tests whether a VMA's flags contain a specific singular VMA flag. * vma_test_any() - Tests whether a VMA's flags contain any of a set of VMA flags. * vma_test_any_mask() - Tests whether a VMA's flags contain any of the flags specified in another, typically constant, vma_flags_t value. * vma_test_single_mask() - Tests whether a VMA's flags contain the single flag specified in an input vma_flags_t flag mask, or if that flag mask is empty, is defined to return false. Useful for config-predicated VMA flag mask defines. * vma_clear_flags() - Clears a specific set of VMA flags from a vma_flags_t value. * vma_clear_flags_mask() - Clears those flags set in a vma_flags_t value (typically constant) from a (typically not constant) vma_flags_t value.
The series mostly focuses on the VMA-specific code, especially that contained in mm/vma.c and mm/vma.h. It updates both brk() and mmap() logic to utilise vma_flags_t values as much as is practically possible at this point, changing surrounding logic to be able to do so. It also updates the vma_modify_xxx() functions where they interact with VMA flags directly to use vm_flags_t values where possible. There is extensive testing added in the VMA userland tests to assert that all of these new VMA flag functions work correctly. This patch (of 25): Firstly, add the ability to determine if VMA flags are empty, that is, no flags are set in a vma_flags_t value. Next, add the ability to obtain the equivalent of the bitwise AND of two vma_flags_t values, via vma_flags_and_mask(). Next, add the ability to obtain the difference between two sets of VMA flags, that is, the equivalent of the exclusive bitwise OR of the two sets of flags, via vma_flags_diff_pair(). vma_flags_xxx_mask() typically operates on a pointer to a vma_flags_t value, which is assumed to be an lvalue of some kind (such as a field in a struct or a stack variable) and an rvalue of some kind (typically a constant set of VMA flags obtained e.g. via mk_vma_flags() or equivalent). However vma_flags_diff_pair() is intended to operate on two lvalues, so use the _pair() suffix to make this clear. Finally, update VMA userland tests to add these helpers. We also port bitmap_xor() and __bitmap_xor() to the tools/ headers and source to allow the tests to work with vma_flags_diff_pair().
Link: https://lkml.kernel.org/r/cover.1774034900.git.ljs@kernel.org Link: https://lkml.kernel.org/r/53ab55b7da91425775e42c03177498ad6de88ef4.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 60 ++++++++++++++++++++++++++++++++++------- include/linux/mm_types.h | 8 ++++++ tools/include/linux/bitmap.h | 13 +++++++++ tools/lib/bitmap.c | 10 +++++++ tools/testing/vma/include/dup.h | 36 ++++++++++++++++++++++++- 5 files changed, 117 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 16a1ad9a3397..7954a7a2b811 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1048,6 +1048,19 @@ static __always_inline vma_flags_t __mk_vma_flags(size_t count, return flags; } +/* + * Helper macro which bitwise-or combines the specified input flags into a + * vma_flags_t bitmap value. E.g.: + * + * vma_flags_t flags = mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT, + * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT); + * + * The compiler cleverly optimises away all of the work and this ends up being + * equivalent to aggregating the values manually. + */ +#define mk_vma_flags(...) 
__mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ + (const vma_flag_t []){__VA_ARGS__}) + /* * Test whether a specific VMA flag is set, e.g.: * @@ -1062,17 +1075,30 @@ static __always_inline bool vma_flags_test(const vma_flags_t *flags, } /* - * Helper macro which bitwise-or combines the specified input flags into a - * vma_flags_t bitmap value. E.g.: - * - * vma_flags_t flags = mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT, - * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT); + * Obtain a set of VMA flags which contain the overlapping flags contained + * within flags and to_and. + */ +static __always_inline vma_flags_t vma_flags_and_mask(const vma_flags_t *flags, + vma_flags_t to_and) +{ + vma_flags_t dst; + unsigned long *bitmap_dst = dst.__vma_flags; + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_and = to_and.__vma_flags; + + bitmap_and(bitmap_dst, bitmap, bitmap_to_and, NUM_VMA_FLAG_BITS); + return dst; +} + +/* + * Obtain a set of VMA flags which contains the specified overlapping flags, + * e.g.: * - * The compiler cleverly optimises away all of the work and this ends up being - * equivalent to aggregating the values manually. + * vma_flags_t read_flags = vma_flags_and(&flags, VMA_READ_BIT, + * VMA_MAY_READ_BIT); */ -#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ - (const vma_flag_t []){__VA_ARGS__}) +#define vma_flags_and(flags, ...) \ + vma_flags_and_mask(flags, mk_vma_flags(__VA_ARGS__)) /* Test each of to_test flags in flags, non-atomically. */ static __always_inline bool vma_flags_test_any_mask(const vma_flags_t *flags, @@ -1146,6 +1172,22 @@ static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, #define vma_flags_clear(flags, ...) \ vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__)) +/* + * Obtain a VMA flags value containing those flags that are present in flags or + * flags_other but not in both. 
+ */ +static __always_inline vma_flags_t vma_flags_diff_pair(const vma_flags_t *flags, + const vma_flags_t *flags_other) +{ + vma_flags_t dst; + const unsigned long *bitmap_other = flags_other->__vma_flags; + const unsigned long *bitmap = flags->__vma_flags; + unsigned long *bitmap_dst = dst.__vma_flags; + + bitmap_xor(bitmap_dst, bitmap, bitmap_other, NUM_VMA_FLAG_BITS); + return dst; +} + /* * Helper to test that ALL specified flags are set in a VMA. * diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f22aecb047b7..321aa150c1ee 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -870,6 +870,14 @@ typedef struct { #define EMPTY_VMA_FLAGS ((vma_flags_t){ }) +/* Are no flags set in the specified VMA flags? */ +static __always_inline bool vma_flags_empty(const vma_flags_t *flags) +{ + const unsigned long *bitmap = flags->__vma_flags; + + return bitmap_empty(bitmap, NUM_VMA_FLAG_BITS); +} + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 250883090a5d..845eda759f67 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -28,6 +28,8 @@ bool __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); +void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) @@ -209,4 +211,15 @@ static inline void bitmap_clear(unsigned long *map, unsigned int start, else __bitmap_clear(map, start, nbits); } + +static __always_inline +void bitmap_xor(unsigned long *dst, 
const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) +{ + if (small_const_nbits(nbits)) + *dst = *src1 ^ *src2; + else + __bitmap_xor(dst, src1, src2, nbits); +} + #endif /* _TOOLS_LINUX_BITMAP_H */ diff --git a/tools/lib/bitmap.c b/tools/lib/bitmap.c index aa83d22c45e3..fedc9070f0e4 100644 --- a/tools/lib/bitmap.c +++ b/tools/lib/bitmap.c @@ -169,3 +169,13 @@ bool __bitmap_subset(const unsigned long *bitmap1, return false; return true; } + +void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int bits) +{ + unsigned int k; + unsigned int nr = BITS_TO_LONGS(bits); + + for (k = 0; k < nr; k++) + dst[k] = bitmap1[k] ^ bitmap2[k]; +} diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 8865ffe046d8..8091a5caaeb8 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -422,6 +422,13 @@ struct vma_iterator { #define MAPCOUNT_ELF_CORE_MARGIN (5) #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) +static __always_inline bool vma_flags_empty(const vma_flags_t *flags) +{ + const unsigned long *bitmap = flags->__vma_flags; + + return bitmap_empty(bitmap, NUM_VMA_FLAG_BITS); +} + /* What action should be taken after an .mmap_prepare call is complete? */ enum mmap_action_type { MMAP_NOTHING, /* Mapping is complete, no further action. */ @@ -855,6 +862,21 @@ static __always_inline bool vma_flags_test(const vma_flags_t *flags, return test_bit((__force int)bit, bitmap); } +static __always_inline vma_flags_t vma_flags_and_mask(const vma_flags_t *flags, + vma_flags_t to_and) +{ + vma_flags_t dst; + unsigned long *bitmap_dst = dst.__vma_flags; + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_and = to_and.__vma_flags; + + bitmap_and(bitmap_dst, bitmap, bitmap_to_and, NUM_VMA_FLAG_BITS); + return dst; +} + +#define vma_flags_and(flags, ...) 
\ + vma_flags_and_mask(flags, mk_vma_flags(__VA_ARGS__)) + static __always_inline bool vma_flags_test_any_mask(const vma_flags_t *flags, vma_flags_t to_test) { @@ -901,8 +923,20 @@ static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t #define vma_flags_clear(flags, ...) \ vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__)) +static __always_inline vma_flags_t vma_flags_diff_pair(const vma_flags_t *flags, + const vma_flags_t *flags_other) +{ + vma_flags_t dst; + const unsigned long *bitmap_other = flags_other->__vma_flags; + const unsigned long *bitmap = flags->__vma_flags; + unsigned long *bitmap_dst = dst.__vma_flags; + + bitmap_xor(bitmap_dst, bitmap, bitmap_other, NUM_VMA_FLAG_BITS); + return dst; +} + static inline bool vma_test_all_mask(const struct vm_area_struct *vma, - vma_flags_t flags) + vma_flags_t flags) { return vma_flags_test_all_mask(&vma->flags, flags); } -- cgit v1.2.3 From 8228e42b5f88aa68708ced277399ee3b59748627 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:20 +0000 Subject: mm/vma: add further vma_flags_t unions In order to utilise the new vma_flags_t type, we currently place it in union with legacy vm_flags fields of type vm_flags_t to make the transition smoother. Add vma_flags_t union entries for mm->def_flags and vmg->vm_flags - mm->def_vma_flags and vmg->vma_flags respectively. Once the conversion is complete, these will be replaced with vma_flags_t entries alone. Also update the VMA tests to reflect the change. Link: https://lkml.kernel.org/r/d507d542c089ba132e9da53f2ff7f80ca117c3b4.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 6 +++++- mm/vma.h | 6 +++++- tools/testing/vma/include/dup.h | 5 ++++- 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 321aa150c1ee..8ef84849953f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1249,7 +1249,11 @@ struct mm_struct { unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ unsigned long stack_vm; /* VM_STACK */ - vm_flags_t def_flags; + union { + /* Temporary while VMA flags are being converted. */ + vm_flags_t def_flags; + vma_flags_t def_vma_flags; + }; /** * @write_protect_seq: Locked when any thread is write diff --git a/mm/vma.h b/mm/vma.h index eba388c61ef4..cf8926558bf6 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -98,7 +98,11 @@ struct vma_merge_struct { unsigned long end; pgoff_t pgoff; - vm_flags_t vm_flags; + union { + /* Temporary while VMA flags are being converted. 
*/ + vm_flags_t vm_flags; + vma_flags_t vma_flags; + }; struct file *file; struct anon_vma *anon_vma; struct mempolicy *policy; diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 8091a5caaeb8..58e063b1ee27 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -33,7 +33,10 @@ struct mm_struct { unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ unsigned long stack_vm; /* VM_STACK */ - unsigned long def_flags; + union { + vm_flags_t def_flags; + vma_flags_t def_vma_flags; + }; mm_flags_t flags; /* Must use mm_flags_* helpers to access */ }; -- cgit v1.2.3 From 7ec1885a7e283caaf6566aedc1eea5988d545f97 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:22 +0000 Subject: mm/vma: use new VMA flags for sticky flags logic Use the new vma_flags_t flags implementation to perform the logic around sticky flags and what flags are ignored on VMA merge. We make use of the new vma_flags_empty(), vma_flags_diff_pair(), and vma_flags_and_mask() functionality. Also update the VMA tests accordingly. Link: https://lkml.kernel.org/r/369574f06360ffa44707047e3b58eb4897345fba.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 32 ++++++++++++++----------- mm/vma.c | 48 ++++++++++++++++++++++++++++---------- tools/testing/vma/include/custom.h | 5 ---- tools/testing/vma/include/dup.h | 9 +++++-- 4 files changed, 62 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7954a7a2b811..d7e647e31742 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -540,6 +540,7 @@ enum { /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) +#define VMA_ACCESS_FLAGS mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT) /* * Special vmas that are non-mergable, non-mlock()able. @@ -585,27 +586,32 @@ enum { * possesses it but the other does not, the merged VMA should nonetheless have * applied to it: * - * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its - * references cleared via /proc/$pid/clear_refs, any merged VMA - * should be considered soft-dirty also as it operates at a VMA - * granularity. + * VMA_SOFTDIRTY_BIT - if a VMA is marked soft-dirty, that is has not had its + * references cleared via /proc/$pid/clear_refs, any + * merged VMA should be considered soft-dirty also as it + * operates at a VMA granularity. 
* - * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that - * mapped page tables may contain metadata not described by the - * VMA and thus any merged VMA may also contain this metadata, - * and thus we must make this flag sticky. + * VMA_MAYBE_GUARD_BIT - If a VMA may have guard regions in place it implies + * that mapped page tables may contain metadata not + * described by the VMA and thus any merged VMA may also + * contain this metadata, and thus we must make this flag + * sticky. */ -#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD) +#ifdef CONFIG_MEM_SOFT_DIRTY +#define VMA_STICKY_FLAGS mk_vma_flags(VMA_SOFTDIRTY_BIT, VMA_MAYBE_GUARD_BIT) +#else +#define VMA_STICKY_FLAGS mk_vma_flags(VMA_MAYBE_GUARD_BIT) +#endif /* * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one * of these flags and the other not does not preclude a merge. * - * VM_STICKY - When merging VMAs, VMA flags must match, unless they are - * 'sticky'. If any sticky flags exist in either VMA, we simply - * set all of them on the merged VMA. + * VMA_STICKY_FLAGS - When merging VMAs, VMA flags must match, unless they + * are 'sticky'. If any sticky flags exist in either VMA, + * we simply set all of them on the merged VMA. */ -#define VM_IGNORE_MERGE VM_STICKY +#define VMA_IGNORE_MERGE_FLAGS VMA_STICKY_FLAGS /* * Flags which should result in page tables being copied on fork. These are diff --git a/mm/vma.c b/mm/vma.c index 4d21e7d8e93c..6af26619e020 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -86,10 +86,15 @@ static bool vma_is_fork_child(struct vm_area_struct *vma) static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *vma = merge_next ? 
vmg->next : vmg->prev; + vma_flags_t diff; if (!mpol_equal(vmg->policy, vma_policy(vma))) return false; - if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_IGNORE_MERGE) + + diff = vma_flags_diff_pair(&vma->flags, &vmg->vma_flags); + vma_flags_clear_mask(&diff, VMA_IGNORE_MERGE_FLAGS); + + if (!vma_flags_empty(&diff)) return false; if (vma->vm_file != vmg->file) return false; @@ -805,7 +810,8 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma) static __must_check struct vm_area_struct *vma_merge_existing_range( struct vma_merge_struct *vmg) { - vm_flags_t sticky_flags = vmg->vm_flags & VM_STICKY; + vma_flags_t sticky_flags = vma_flags_and_mask(&vmg->vma_flags, + VMA_STICKY_FLAGS); struct vm_area_struct *middle = vmg->middle; struct vm_area_struct *prev = vmg->prev; struct vm_area_struct *next; @@ -898,15 +904,22 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( vma_start_write(middle); if (merge_right) { + vma_flags_t next_sticky; + vma_start_write(next); vmg->target = next; - sticky_flags |= (next->vm_flags & VM_STICKY); + next_sticky = vma_flags_and_mask(&next->flags, VMA_STICKY_FLAGS); + vma_flags_set_mask(&sticky_flags, next_sticky); } if (merge_left) { + vma_flags_t prev_sticky; + vma_start_write(prev); vmg->target = prev; - sticky_flags |= (prev->vm_flags & VM_STICKY); + + prev_sticky = vma_flags_and_mask(&prev->flags, VMA_STICKY_FLAGS); + vma_flags_set_mask(&sticky_flags, prev_sticky); } if (merge_both) { @@ -976,7 +989,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( if (err || commit_merge(vmg)) goto abort; - vm_flags_set(vmg->target, sticky_flags); + vma_set_flags_mask(vmg->target, sticky_flags); khugepaged_enter_vma(vmg->target, vmg->vm_flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->target; @@ -1154,12 +1167,16 @@ int vma_expand(struct vma_merge_struct *vmg) struct vm_area_struct *target = vmg->target; struct vm_area_struct *next = vmg->next; bool remove_next = false; - vm_flags_t sticky_flags; + 
vma_flags_t sticky_flags = + vma_flags_and_mask(&vmg->vma_flags, VMA_STICKY_FLAGS); + vma_flags_t target_sticky; int ret = 0; mmap_assert_write_locked(vmg->mm); vma_start_write(target); + target_sticky = vma_flags_and_mask(&target->flags, VMA_STICKY_FLAGS); + if (next && target != next && vmg->end == next->vm_end) remove_next = true; @@ -1174,10 +1191,7 @@ int vma_expand(struct vma_merge_struct *vmg) VM_WARN_ON_VMG(target->vm_start < vmg->start || target->vm_end > vmg->end, vmg); - sticky_flags = vmg->vm_flags & VM_STICKY; - sticky_flags |= target->vm_flags & VM_STICKY; - if (remove_next) - sticky_flags |= next->vm_flags & VM_STICKY; + vma_flags_set_mask(&sticky_flags, target_sticky); /* * If we are removing the next VMA or copying from a VMA @@ -1194,13 +1208,18 @@ int vma_expand(struct vma_merge_struct *vmg) return ret; if (remove_next) { + vma_flags_t next_sticky; + vma_start_write(next); vmg->__remove_next = true; + + next_sticky = vma_flags_and_mask(&next->flags, VMA_STICKY_FLAGS); + vma_flags_set_mask(&sticky_flags, next_sticky); } if (commit_merge(vmg)) goto nomem; - vm_flags_set(target, sticky_flags); + vma_set_flags_mask(target, sticky_flags); return 0; nomem: @@ -1950,10 +1969,15 @@ out: */ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) { + vma_flags_t diff = vma_flags_diff_pair(&a->flags, &b->flags); + + vma_flags_clear_mask(&diff, VMA_ACCESS_FLAGS); + vma_flags_clear_mask(&diff, VMA_IGNORE_MERGE_FLAGS); + return a->vm_end == b->vm_start && mpol_equal(vma_policy(a), vma_policy(b)) && a->vm_file == b->vm_file && - !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_IGNORE_MERGE)) && + vma_flags_empty(&diff) && b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); } diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index 6200f938e586..7cdd0f60600a 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -134,8 +134,3 @@ static 
__always_inline bool vma_flags_same_mask(vma_flags_t *flags, vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) #define VMA_SPECIAL_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_DONTEXPAND_BIT, \ VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT) -#ifdef CONFIG_MEM_SOFT_DIRTY -#define VMA_STICKY_FLAGS mk_vma_flags(VMA_SOFTDIRTY_BIT, VMA_MAYBE_GUARD_BIT) -#else -#define VMA_STICKY_FLAGS mk_vma_flags(VMA_MAYBE_GUARD_BIT) -#endif diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 1dee78c34872..65134303b645 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -338,6 +338,7 @@ enum { /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) +#define VMA_ACCESS_FLAGS mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT) /* * Special vmas that are non-mergable, non-mlock()able. @@ -363,9 +364,13 @@ enum { #define CAP_IPC_LOCK 14 -#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD) +#ifdef CONFIG_MEM_SOFT_DIRTY +#define VMA_STICKY_FLAGS mk_vma_flags(VMA_SOFTDIRTY_BIT, VMA_MAYBE_GUARD_BIT) +#else +#define VMA_STICKY_FLAGS mk_vma_flags(VMA_MAYBE_GUARD_BIT) +#endif -#define VM_IGNORE_MERGE VM_STICKY +#define VMA_IGNORE_MERGE_FLAGS VMA_STICKY_FLAGS #define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD) -- cgit v1.2.3 From e8d464f4a94ccbcae8c9d3137ac5621b57ddd8a1 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:24 +0000 Subject: mm/vma: add append_vma_flags() helper In order to be able to efficiently combine VMA flag masks with additional VMA flag bits we need to extend the concept introduced in mk_vma_flags() and __mk_vma_flags() by allowing the specification of a VMA flag mask to append VMA flag bits to. Update __mk_vma_flags() to allow for this and update mk_vma_flags() accordingly, and also provide append_vma_flags() to allow for the caller to specify which VMA flags mask to append to. 
Finally, update the VMA flags tests to reflect the change. Link: https://lkml.kernel.org/r/9f928cd4688270002f2c0c3777fcc9b49cc7a8ea.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 20 ++++++++++++++------ tools/testing/vma/include/dup.h | 14 +++++++------- 2 files changed, 21 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index d7e647e31742..26cfb2fbe4db 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1042,13 +1042,11 @@ static __always_inline void vma_flags_set_flag(vma_flags_t *flags, __set_bit((__force int)bit, bitmap); } -static __always_inline vma_flags_t __mk_vma_flags(size_t count, - const vma_flag_t *bits) +static __always_inline vma_flags_t __mk_vma_flags(vma_flags_t flags, + size_t count, const vma_flag_t *bits) { - vma_flags_t flags; int i; - vma_flags_clear_all(&flags); for (i = 0; i < count; i++) vma_flags_set_flag(&flags, bits[i]); return flags; @@ -1064,8 +1062,18 @@ static __always_inline vma_flags_t __mk_vma_flags(size_t count, * The compiler cleverly optimises away all of the work and this ends up being * equivalent to 
aggregating the values manually. */ -#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ - (const vma_flag_t []){__VA_ARGS__}) +#define mk_vma_flags(...) __mk_vma_flags(EMPTY_VMA_FLAGS, \ + COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) + +/* + * Helper macro which acts like mk_vma_flags, only appending to a copy of the + * specified flags rather than establishing new flags. E.g.: + * + * vma_flags_t flags = append_vma_flags(VMA_STACK_DEFAULT_FLAGS, VMA_STACK_BIT, + * VMA_ACCOUNT_BIT); + */ +#define append_vma_flags(flags, ...) __mk_vma_flags(flags, \ + COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) /* * Test whether a specific VMA flag is set, e.g.: diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 3005e33d1ede..a2f311b5ea82 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -854,21 +854,21 @@ static inline void vm_flags_clear(struct vm_area_struct *vma, vma_flags_clear_word(&vma->flags, flags); } -static __always_inline vma_flags_t __mk_vma_flags(size_t count, - const vma_flag_t *bits) +static __always_inline vma_flags_t __mk_vma_flags(vma_flags_t flags, + size_t count, const vma_flag_t *bits) { - vma_flags_t flags; int i; - vma_flags_clear_all(&flags); for (i = 0; i < count; i++) vma_flags_set_flag(&flags, bits[i]); - return flags; } -#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ - (const vma_flag_t []){__VA_ARGS__}) +#define mk_vma_flags(...) __mk_vma_flags(EMPTY_VMA_FLAGS, \ + COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) + +#define append_vma_flags(flags, ...) 
__mk_vma_flags(flags, \ + COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) static __always_inline bool vma_flags_test(const vma_flags_t *flags, vma_flag_t bit) -- cgit v1.2.3 From 5fb55e951cf591c5e2d45273ceadbdcd0c44932c Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:26 +0000 Subject: mm: unexport vm_brk_flags() and eliminate vm_flags parameter This function is only used by elf_load(), and that is a static function that doesn't need an exported symbol to invoke an internal function, so un-EXPORT_SYMBOLS() it. Also, the vm_flags parameter is unnecessary, as we only ever set VM_EXEC, so simply make this parameter a boolean. While we're here, clean up the mm.h definitions for the various vm_xxx() helpers so we actually specify parameter names and elide the redundant extern's. Link: https://lkml.kernel.org/r/7bada48ddf3f9dbd3e6c4fc50ec2f4de97706f52.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- fs/binfmt_elf.c | 3 +-- include/linux/mm.h | 12 ++++++------ mm/mmap.c | 8 ++------ 3 files changed, 9 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index fb857faaf0d6..16a56b6b3f6c 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -453,14 +453,13 @@ static unsigned long elf_load(struct file *filep, unsigned long addr, zero_end = ELF_PAGEALIGN(zero_end); error = vm_brk_flags(zero_start, zero_end - zero_start, - prot & PROT_EXEC ? 
VM_EXEC : 0); + prot & PROT_EXEC); if (error) map_addr = error; } return map_addr; } - static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr) { elf_addr_t min_addr = -1; diff --git a/include/linux/mm.h b/include/linux/mm.h index 26cfb2fbe4db..5b85ffc2760c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3991,12 +3991,12 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {} #endif /* This takes the mm semaphore itself */ -extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long); -extern int vm_munmap(unsigned long, size_t); -extern unsigned long __must_check vm_mmap(struct file *, unsigned long, - unsigned long, unsigned long, - unsigned long, unsigned long); -extern unsigned long __must_check vm_mmap_shadow_stack(unsigned long addr, +int __must_check vm_brk_flags(unsigned long addr, unsigned long request, bool is_exec); +int vm_munmap(unsigned long start, size_t len); +unsigned long __must_check vm_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long offset); +unsigned long __must_check vm_mmap_shadow_stack(unsigned long addr, unsigned long len, unsigned long flags); struct vm_unmapped_area_info { diff --git a/mm/mmap.c b/mm/mmap.c index 79544d893411..2d2b814978bf 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1201,8 +1201,9 @@ out: return ret; } -int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags) +int vm_brk_flags(unsigned long addr, unsigned long request, bool is_exec) { + const vm_flags_t vm_flags = is_exec ? VM_EXEC : 0; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; unsigned long len; @@ -1217,10 +1218,6 @@ int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags) if (!len) return 0; - /* Until we need other flags, refuse anything except VM_EXEC. 
*/ - if ((vm_flags & (~VM_EXEC)) != 0) - return -EINVAL; - if (mmap_write_lock_killable(mm)) return -EINTR; @@ -1246,7 +1243,6 @@ limits_failed: mmap_write_unlock(mm); return ret; } -EXPORT_SYMBOL(vm_brk_flags); static unsigned long tear_down_vmas(struct mm_struct *mm, struct vma_iterator *vmi, -- cgit v1.2.3 From 3ee584538259c356c66146ac46f2e4fd2ba28bee Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:27 +0000 Subject: mm/vma: introduce vma_flags_same[_mask/_pair]() Add helpers to determine if two sets of VMA flags are precisely the same, that is - that every flag set one is set in another, and neither contain any flags not set in the other. We also introduce vma_flags_same_pair() for cases where we want to compare two sets of VMA flags which are both non-const values. Also update the VMA tests to reflect the change, we already implicitly test that this functions correctly having used it for testing purposes previously. Link: https://lkml.kernel.org/r/4f764bf619e77205837c7c819b62139ef6337ca3.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 28 ++++++++++++++++++++++++++++ tools/testing/vma/include/custom.h | 11 ----------- tools/testing/vma/include/dup.h | 21 +++++++++++++++++++++ 3 files changed, 49 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5b85ffc2760c..1f3e9100164d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1202,6 +1202,34 @@ static __always_inline vma_flags_t vma_flags_diff_pair(const vma_flags_t *flags, return dst; } +/* Determine if flags and flags_other have precisely the same flags set. */ +static __always_inline bool vma_flags_same_pair(const vma_flags_t *flags, + const vma_flags_t *flags_other) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_other = flags_other->__vma_flags; + + return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS); +} + +/* Determine if flags and flags_other have precisely the same flags set. */ +static __always_inline bool vma_flags_same_mask(const vma_flags_t *flags, + vma_flags_t flags_other) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_other = flags_other.__vma_flags; + + return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS); +} + +/* + * Helper macro to determine if only the specific flags are set, e.g.: + * + * if (vma_flags_same(&flags, VMA_WRITE_BIT) { ... } + */ +#define vma_flags_same(flags, ...) 
\ + vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) + /* * Helper to test that ALL specified flags are set in a VMA. * diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index 8f33df02816a..2c498e713fbd 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -102,16 +102,5 @@ static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) return PAGE_SIZE; } -/* Place here until needed in the kernel code. */ -static __always_inline bool vma_flags_same_mask(vma_flags_t *flags, - vma_flags_t flags_other) -{ - const unsigned long *bitmap = flags->__vma_flags; - const unsigned long *bitmap_other = flags_other.__vma_flags; - - return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS); -} -#define vma_flags_same(flags, ...) \ - vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) #define VMA_SPECIAL_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_DONTEXPAND_BIT, \ VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT) diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 802b3d97b627..65f630923461 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -954,6 +954,27 @@ static __always_inline vma_flags_t vma_flags_diff_pair(const vma_flags_t *flags, return dst; } +static __always_inline bool vma_flags_same_pair(const vma_flags_t *flags, + const vma_flags_t *flags_other) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_other = flags_other->__vma_flags; + + return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS); +} + +static __always_inline bool vma_flags_same_mask(const vma_flags_t *flags, + vma_flags_t flags_other) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_other = flags_other.__vma_flags; + + return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS); +} + +#define vma_flags_same(flags, ...) 
\ + vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) + static inline bool vma_test_all_mask(const struct vm_area_struct *vma, vma_flags_t flags) { -- cgit v1.2.3 From c8555bc95d6222aa729b3a1195e07e566707ec02 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:28 +0000 Subject: mm/vma: introduce [vma_flags,legacy]_to_[legacy,vma_flags]() helpers While we are still converting VMA flags from vm_flags_t to vma_flags_t, introduce helpers to convert between the two to allow for iterative development without having to 'change the world' in a single commit. Also update VMA flags tests to reflect the change. Finally, refresh vma_flags_overwrite_word(), vma_flags_overwrite_word_once(), vma_flags_set_word() and vma_flags_clear_word() in the VMA tests to reflect current kernel implementations - this should make no functional difference, but keeps the logic consistent between the two. Link: https://lkml.kernel.org/r/d3569470dbb3ae79134ca7c3eb3fc4df7086e874.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 26 ++++++++++++++++++++++++++ tools/testing/vma/include/dup.h | 36 ++++++++++++++++++++++++++++++++---- 2 files changed, 58 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8ef84849953f..1da8fb04133f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1069,6 +1069,18 @@ static __always_inline void vma_flags_clear_all(vma_flags_t *flags) bitmap_zero(flags->__vma_flags, NUM_VMA_FLAG_BITS); } +/* + * Helper function which converts a vma_flags_t value to a legacy vm_flags_t + * value. This is only valid if the input flags value can be expressed in a + * system word. + * + * Will be removed once the conversion to VMA flags is complete. + */ +static __always_inline vm_flags_t vma_flags_to_legacy(vma_flags_t flags) +{ + return (vm_flags_t)flags.__vma_flags[0]; +} + /* * Copy value to the first system word of VMA flags, non-atomically. * @@ -1082,6 +1094,20 @@ static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long va bitmap[0] = value; } +/* + * Helper function which converts a legacy vm_flags_t value to a vma_flags_t + * value. + * + * Will be removed once the conversion to VMA flags is complete. 
+ */ +static __always_inline vma_flags_t legacy_to_vma_flags(vm_flags_t flags) +{ + vma_flags_t ret = EMPTY_VMA_FLAGS; + + vma_flags_overwrite_word(&ret, flags); + return ret; +} + /* * Copy value to the first system word of VMA flags ONCE, non-atomically. * diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 65f630923461..f49af21319ba 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -766,7 +766,9 @@ static inline bool mm_flags_test(int flag, const struct mm_struct *mm) */ static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) { - *ACCESS_PRIVATE(flags, __vma_flags) = value; + unsigned long *bitmap = flags->__vma_flags; + + bitmap[0] = value; } /* @@ -777,7 +779,7 @@ static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long va */ static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) { - unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + unsigned long *bitmap = flags->__vma_flags; WRITE_ONCE(*bitmap, value); } @@ -785,7 +787,7 @@ static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned lo /* Update the first system word of VMA flags setting bits, non-atomically. */ static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) { - unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + unsigned long *bitmap = flags->__vma_flags; *bitmap |= value; } @@ -793,7 +795,7 @@ static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) /* Update the first system word of VMA flags clearing bits, non-atomically. 
*/ static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) { - unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + unsigned long *bitmap = flags->__vma_flags; *bitmap &= ~value; } @@ -803,6 +805,32 @@ static __always_inline void vma_flags_clear_all(vma_flags_t *flags) bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS); } +/* + * Helper function which converts a vma_flags_t value to a legacy vm_flags_t + * value. This is only valid if the input flags value can be expressed in a + * system word. + * + * Will be removed once the conversion to VMA flags is complete. + */ +static __always_inline vm_flags_t vma_flags_to_legacy(vma_flags_t flags) +{ + return (vm_flags_t)flags.__vma_flags[0]; +} + +/* + * Helper function which converts a legacy vm_flags_t value to a vma_flags_t + * value. + * + * Will be removed once the conversion to VMA flags is complete. + */ +static __always_inline vma_flags_t legacy_to_vma_flags(vm_flags_t flags) +{ + vma_flags_t ret = EMPTY_VMA_FLAGS; + + vma_flags_overwrite_word(&ret, flags); + return ret; +} + static __always_inline void vma_flags_set_flag(vma_flags_t *flags, vma_flag_t bit) { -- cgit v1.2.3 From fb67bba5d9b8561f433695c8916c097910193561 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:30 +0000 Subject: mm/vma: introduce vma_test[_any[_mask]](), and make inlining consistent Introduce helper functions and macros to make it convenient to test flags and flag masks for VMAs, specifically: * vma_test() - determine if a single VMA flag is set in a VMA. * vma_test_any_mask() - determine if any flags in a vma_flags_t value are set in a VMA. * vma_test_any() - Helper macro to test if any of specific flags are set. Also, there is a mix of 'inline's and '__always_inline's in VMA helper function declarations; update them to consistently use __always_inline. Finally, update the VMA tests to reflect the changes. 
Link: https://lkml.kernel.org/r/be1d71f08307d747a82232cbd8664a88c0f41419.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 49 +++++++++++++++++++++++++++------ include/linux/mm_types.h | 12 +++++--- tools/testing/vma/include/dup.h | 61 +++++++++++++++++++++++++++-------------- 3 files changed, 88 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1f3e9100164d..f704d7cf2871 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -994,7 +994,8 @@ static inline void vm_flags_mod(struct vm_area_struct *vma, __vm_flags_mod(vma, set, clear); } -static inline bool __vma_atomic_valid_flag(struct vm_area_struct *vma, vma_flag_t bit) +static __always_inline bool __vma_atomic_valid_flag(struct vm_area_struct *vma, + vma_flag_t bit) { const vm_flags_t mask = BIT((__force int)bit); @@ -1009,7 +1010,8 @@ static inline bool __vma_atomic_valid_flag(struct vm_area_struct *vma, vma_flag_ * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific * valid flags are allowed to do this. 
*/ -static inline void vma_set_atomic_flag(struct vm_area_struct *vma, vma_flag_t bit) +static __always_inline void vma_set_atomic_flag(struct vm_area_struct *vma, + vma_flag_t bit) { unsigned long *bitmap = vma->flags.__vma_flags; @@ -1025,7 +1027,8 @@ static inline void vma_set_atomic_flag(struct vm_area_struct *vma, vma_flag_t bi * This is necessarily racey, so callers must ensure that serialisation is * achieved through some other means, or that races are permissible. */ -static inline bool vma_test_atomic_flag(struct vm_area_struct *vma, vma_flag_t bit) +static __always_inline bool vma_test_atomic_flag(struct vm_area_struct *vma, + vma_flag_t bit) { if (__vma_atomic_valid_flag(vma, bit)) return test_bit((__force int)bit, &vma->vm_flags); @@ -1230,13 +1233,41 @@ static __always_inline bool vma_flags_same_mask(const vma_flags_t *flags, #define vma_flags_same(flags, ...) \ vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) +/* + * Test whether a specific flag in the VMA is set, e.g.: + * + * if (vma_test(vma, VMA_READ_BIT)) { ... } + */ +static __always_inline bool vma_test(const struct vm_area_struct *vma, + vma_flag_t bit) +{ + return vma_flags_test(&vma->flags, bit); +} + +/* Helper to test any VMA flags in a VMA . */ +static __always_inline bool vma_test_any_mask(const struct vm_area_struct *vma, + vma_flags_t flags) +{ + return vma_flags_test_any_mask(&vma->flags, flags); +} + +/* + * Helper macro for testing whether any VMA flags are set in a VMA, + * e.g.: + * + * if (vma_test_any(vma, VMA_IO_BIT, VMA_PFNMAP_BIT, + * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... } + */ +#define vma_test_any(vma, ...) \ + vma_test_any_mask(vma, mk_vma_flags(__VA_ARGS__)) + /* * Helper to test that ALL specified flags are set in a VMA. * * Note: appropriate locks must be held, this function does not acquire them for * you. 
*/ -static inline bool vma_test_all_mask(const struct vm_area_struct *vma, +static __always_inline bool vma_test_all_mask(const struct vm_area_struct *vma, vma_flags_t flags) { return vma_flags_test_all_mask(&vma->flags, flags); @@ -1256,7 +1287,7 @@ static inline bool vma_test_all_mask(const struct vm_area_struct *vma, * Note: appropriate locks must be held, this function does not acquire them for * you. */ -static inline void vma_set_flags_mask(struct vm_area_struct *vma, +static __always_inline void vma_set_flags_mask(struct vm_area_struct *vma, vma_flags_t flags) { vma_flags_set_mask(&vma->flags, flags); @@ -1286,7 +1317,7 @@ static __always_inline bool vma_desc_test(const struct vm_area_desc *desc, } /* Helper to test any VMA flags in a VMA descriptor. */ -static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, +static __always_inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_flags_t flags) { return vma_flags_test_any_mask(&desc->vma_flags, flags); @@ -1303,7 +1334,7 @@ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) /* Helper to test all VMA flags in a VMA descriptor. */ -static inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, +static __always_inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, vma_flags_t flags) { return vma_flags_test_all_mask(&desc->vma_flags, flags); @@ -1319,7 +1350,7 @@ static inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, vma_desc_test_all_mask(desc, mk_vma_flags(__VA_ARGS__)) /* Helper to set all VMA flags in a VMA descriptor. 
*/ -static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, +static __always_inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, vma_flags_t flags) { vma_flags_set_mask(&desc->vma_flags, flags); @@ -1336,7 +1367,7 @@ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) /* Helper to clear all VMA flags in a VMA descriptor. */ -static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, +static __always_inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, vma_flags_t flags) { vma_flags_clear_mask(&desc->vma_flags, flags); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1da8fb04133f..38fe6b915024 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1087,7 +1087,8 @@ static __always_inline vm_flags_t vma_flags_to_legacy(vma_flags_t flags) * IMPORTANT: This does not overwrite bytes past the first system word. The * caller must account for this. */ -static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_overwrite_word(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -1114,7 +1115,8 @@ static __always_inline vma_flags_t legacy_to_vma_flags(vm_flags_t flags) * IMPORTANT: This does not overwrite bytes past the first system word. The * caller must account for this. */ -static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_overwrite_word_once(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -1122,7 +1124,8 @@ static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned lo } /* Update the first system word of VMA flags setting bits, non-atomically. 
*/ -static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_set_word(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -1130,7 +1133,8 @@ static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) } /* Update the first system word of VMA flags clearing bits, non-atomically. */ -static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_clear_word(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index f49af21319ba..f9fe07a8a443 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -764,7 +764,8 @@ static inline bool mm_flags_test(int flag, const struct mm_struct *mm) * IMPORTANT: This does not overwrite bytes past the first system word. The * caller must account for this. */ -static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_overwrite_word(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -777,7 +778,8 @@ static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long va * IMPORTANT: This does not overwrite bytes past the first system word. The * caller must account for this. */ -static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_overwrite_word_once(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -785,7 +787,8 @@ static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned lo } /* Update the first system word of VMA flags setting bits, non-atomically. 
*/ -static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_set_word(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -793,7 +796,8 @@ static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) } /* Update the first system word of VMA flags clearing bits, non-atomically. */ -static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_clear_word(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -1003,23 +1007,32 @@ static __always_inline bool vma_flags_same_mask(const vma_flags_t *flags, #define vma_flags_same(flags, ...) \ vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) -static inline bool vma_test_all_mask(const struct vm_area_struct *vma, - vma_flags_t flags) +static __always_inline bool vma_test(const struct vm_area_struct *vma, + vma_flag_t bit) { - return vma_flags_test_all_mask(&vma->flags, flags); + return vma_flags_test(&vma->flags, bit); } -#define vma_test_all(vma, ...) \ - vma_test_all_mask(vma, mk_vma_flags(__VA_ARGS__)) +static __always_inline bool vma_test_any_mask(const struct vm_area_struct *vma, + vma_flags_t flags) +{ + return vma_flags_test_any_mask(&vma->flags, flags); +} -static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags) +#define vma_test_any(vma, ...) \ + vma_test_any_mask(vma, mk_vma_flags(__VA_ARGS__)) + +static __always_inline bool vma_test_all_mask(const struct vm_area_struct *vma, + vma_flags_t flags) { - return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == - (VM_SHARED | VM_MAYWRITE); + return vma_flags_test_all_mask(&vma->flags, flags); } -static inline void vma_set_flags_mask(struct vm_area_struct *vma, - vma_flags_t flags) +#define vma_test_all(vma, ...) 
\ + vma_test_all_mask(vma, mk_vma_flags(__VA_ARGS__)) + +static __always_inline void vma_set_flags_mask(struct vm_area_struct *vma, + vma_flags_t flags) { vma_flags_set_mask(&vma->flags, flags); } @@ -1033,8 +1046,8 @@ static __always_inline bool vma_desc_test(const struct vm_area_desc *desc, return vma_flags_test(&desc->vma_flags, bit); } -static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, - vma_flags_t flags) +static __always_inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, + vma_flags_t flags) { return vma_flags_test_any_mask(&desc->vma_flags, flags); } @@ -1042,7 +1055,7 @@ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, #define vma_desc_test_any(desc, ...) \ vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) -static inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, +static __always_inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, vma_flags_t flags) { return vma_flags_test_all_mask(&desc->vma_flags, flags); @@ -1051,8 +1064,8 @@ static inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, #define vma_desc_test_all(desc, ...) \ vma_desc_test_all_mask(desc, mk_vma_flags(__VA_ARGS__)) -static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, - vma_flags_t flags) +static __always_inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, + vma_flags_t flags) { vma_flags_set_mask(&desc->vma_flags, flags); } @@ -1060,8 +1073,8 @@ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, #define vma_desc_set_flags(desc, ...) 
\ vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) -static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, - vma_flags_t flags) +static __always_inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, + vma_flags_t flags) { vma_flags_clear_mask(&desc->vma_flags, flags); } @@ -1069,6 +1082,12 @@ static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, #define vma_desc_clear_flags(desc, ...) \ vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) +static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags) +{ + return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == + (VM_SHARED | VM_MAYWRITE); +} + static inline bool is_shared_maywrite(const vma_flags_t *flags) { return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT); -- cgit v1.2.3 From e79d1c500f52506b9eab39e81017e30b76f2864d Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:32 +0000 Subject: mm: introduce vma_flags_count() and vma[_flags]_test_single_mask() vma_flags_count() determines how many bits are set in VMA flags, using bitmap_weight(). vma_flags_test_single_mask() determines if a vma_flags_t set of flags contains a single flag specified as another vma_flags_t value; if the sought flag mask is empty, it is defined to return false. This is useful when we want to declare a VMA flag as optionally a single flag in a mask or empty depending on kernel configuration. This allows us to have VM_NONE-like semantics when checking whether the flag is set. In a subsequent patch, we introduce the use of VMA_DROPPABLE of type vma_flags_t using precisely these semantics. It would be actively confusing to use vma_flags_test_any_mask() for this (and vma_flags_test_all_mask() is not correct to use here, as it trivially returns true when tested against an empty vma flags mask). 
We introduce vma_flags_count() to be able to assert that the compared flag mask is singular or empty, checked when CONFIG_DEBUG_VM is enabled. Also update the VMA tests as part of this change. Link: https://lkml.kernel.org/r/cd778dd02b9f2a01eb54d25a49dea8ec2ddf7753.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 46 ++++++++++++++++++++++++++++++++++++++ tools/testing/vma/include/custom.h | 6 ----- tools/testing/vma/include/dup.h | 21 +++++++++++++++++ tools/testing/vma/vma_internal.h | 6 +++++ 4 files changed, 73 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index f704d7cf2871..de72382efac2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1078,6 +1078,14 @@ static __always_inline vma_flags_t __mk_vma_flags(vma_flags_t flags, #define append_vma_flags(flags, ...) __mk_vma_flags(flags, \ COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) +/* Calculates the number of set bits in the specified VMA flags. 
*/ +static __always_inline int vma_flags_count(const vma_flags_t *flags) +{ + const unsigned long *bitmap = flags->__vma_flags; + + return bitmap_weight(bitmap, NUM_VMA_FLAG_BITS); +} + /* * Test whether a specific VMA flag is set, e.g.: * @@ -1153,6 +1161,26 @@ static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, #define vma_flags_test_all(flags, ...) \ vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__)) +/* + * Helper to test that a flag mask of type vma_flags_t has a SINGLE flag set + * (returning false if flagmask has no flags set). + * + * This is defined to make the semantics clearer when testing an optionally + * defined VMA flags mask, e.g.: + * + * if (vma_flags_test_single_mask(&flags, VMA_DROPPABLE)) { ... } + * + * When VMA_DROPPABLE is defined if available, or set to EMPTY_VMA_FLAGS + * otherwise. + */ +static __always_inline bool vma_flags_test_single_mask(const vma_flags_t *flags, + vma_flags_t flagmask) +{ + VM_WARN_ON_ONCE(vma_flags_count(&flagmask) > 1); + + return vma_flags_test_any_mask(flags, flagmask); +} + /* Set each of the to_set flags in flags, non-atomically. */ static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set) @@ -1281,6 +1309,24 @@ static __always_inline bool vma_test_all_mask(const struct vm_area_struct *vma, #define vma_test_all(vma, ...) \ vma_test_all_mask(vma, mk_vma_flags(__VA_ARGS__)) +/* + * Helper to test that a flag mask of type vma_flags_t has a SINGLE flag set + * (returning false if flagmask has no flags set). + * + * This is useful when a flag needs to be either defined or not depending upon + * kernel configuration, e.g.: + * + * if (vma_test_single_mask(vma, VMA_DROPPABLE)) { ... } + * + * When VMA_DROPPABLE is defined if available, or set to EMPTY_VMA_FLAGS + * otherwise. 
+ */ +static __always_inline bool +vma_test_single_mask(const struct vm_area_struct *vma, vma_flags_t flagmask) +{ + return vma_flags_test_single_mask(&vma->flags, flagmask); +} + /* * Helper to set all VMA flags in a VMA. * diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index 2c498e713fbd..b7d9eb0a44e4 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -15,12 +15,6 @@ extern unsigned long dac_mmap_min_addr; #define dac_mmap_min_addr 0UL #endif -#define VM_WARN_ON(_expr) (WARN_ON(_expr)) -#define VM_WARN_ON_ONCE(_expr) (WARN_ON_ONCE(_expr)) -#define VM_WARN_ON_VMG(_expr, _vmg) (WARN_ON(_expr)) -#define VM_BUG_ON(_expr) (BUG_ON(_expr)) -#define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr)) - #define TASK_SIZE ((1ul << 47)-PAGE_SIZE) /* diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index f9fe07a8a443..244ee02dc21d 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -905,6 +905,13 @@ static __always_inline vma_flags_t __mk_vma_flags(vma_flags_t flags, #define append_vma_flags(flags, ...) __mk_vma_flags(flags, \ COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) +static __always_inline int vma_flags_count(const vma_flags_t *flags) +{ + const unsigned long *bitmap = flags->__vma_flags; + + return bitmap_weight(bitmap, NUM_VMA_FLAG_BITS); +} + static __always_inline bool vma_flags_test(const vma_flags_t *flags, vma_flag_t bit) { @@ -952,6 +959,14 @@ static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, #define vma_flags_test_all(flags, ...) 
\ vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__)) +static __always_inline bool vma_flags_test_single_mask(const vma_flags_t *flags, + vma_flags_t flagmask) +{ + VM_WARN_ON_ONCE(vma_flags_count(&flagmask) > 1); + + return vma_flags_test_any_mask(flags, flagmask); +} + static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set) { unsigned long *bitmap = flags->__vma_flags; @@ -1031,6 +1046,12 @@ static __always_inline bool vma_test_all_mask(const struct vm_area_struct *vma, #define vma_test_all(vma, ...) \ vma_test_all_mask(vma, mk_vma_flags(__VA_ARGS__)) +static __always_inline bool +vma_test_single_mask(const struct vm_area_struct *vma, vma_flags_t flagmask) +{ + return vma_flags_test_single_mask(&vma->flags, flagmask); +} + static __always_inline void vma_set_flags_mask(struct vm_area_struct *vma, vma_flags_t flags) { diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 0e1121e2ef23..e12ab2c80f95 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -51,6 +51,12 @@ typedef unsigned long pgprotval_t; typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; typedef __bitwise unsigned int vm_fault_t; +#define VM_WARN_ON(_expr) (WARN_ON(_expr)) +#define VM_WARN_ON_ONCE(_expr) (WARN_ON_ONCE(_expr)) +#define VM_WARN_ON_VMG(_expr, _vmg) (WARN_ON(_expr)) +#define VM_BUG_ON(_expr) (BUG_ON(_expr)) +#define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr)) + #include "include/stubs.h" #include "include/dup.h" #include "include/custom.h" -- cgit v1.2.3 From 3a6455d56bd7c4cfb1ea35ddae052943065e338e Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:34 +0000 Subject: mm: convert do_brk_flags() to use vma_flags_t In order to be able to do this, we need to change VM_DATA_DEFAULT_FLAGS and friends and update the architecture-specific definitions also. 
We then have to update some KSM logic to handle VMA flags, and introduce VMA_STACK_FLAGS to define the vma_flags_t equivalent of VM_STACK_FLAGS. We also introduce two helper functions for use during the time we are converting legacy flags to vma_flags_t values - vma_flags_to_legacy() and legacy_to_vma_flags(). This enables us to iteratively make changes to break these changes up into separate parts. We use these explicitly here to keep VM_STACK_FLAGS around for certain users which need to maintain the legacy vm_flags_t values for the time being. We are no longer able to rely on the simple VM_xxx being set to zero if the feature is not enabled, so in the case of VM_DROPPABLE we introduce VMA_DROPPABLE as the vma_flags_t equivalent, which is set to EMPTY_VMA_FLAGS if the droppable flag is not available. While we're here, we make the description of do_brk_flags() into a kdoc comment, as it almost was already. We use vma_flags_to_legacy() to not need to update the vm_get_page_prot() logic at this time. Note that in create_init_stack_vma() we have to replace the BUILD_BUG_ON() with a VM_WARN_ON_ONCE() as the tested values are no longer available at build time. We also update mprotect_fixup() to use VMA flags where possible, though we have to live with a little duplication between vm_flags_t and vma_flags_t values for the time being until further conversions are made. While we're here, update VM_SPECIAL to be defined in terms of VMA_SPECIAL_FLAGS now we have vma_flags_to_legacy(). Finally, we update the VMA tests to reflect these changes.
Link: https://lkml.kernel.org/r/d02e3e45d9a33d7904b149f5604904089fd640ae.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Paul Moore [SELinux] Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- arch/arc/include/asm/page.h | 2 +- arch/arm/include/asm/page.h | 2 +- arch/arm64/include/asm/page.h | 7 +++++- arch/hexagon/include/asm/page.h | 2 +- arch/loongarch/include/asm/page.h | 2 +- arch/mips/include/asm/page.h | 2 +- arch/nios2/include/asm/page.h | 2 +- arch/powerpc/include/asm/page.h | 4 ++-- arch/powerpc/include/asm/page_32.h | 2 +- arch/powerpc/include/asm/page_64.h | 12 +++++----- arch/riscv/include/asm/page.h | 2 +- arch/s390/include/asm/page.h | 2 +- arch/x86/include/asm/page_types.h | 2 +- arch/x86/um/asm/vm-flags.h | 4 ++-- include/linux/ksm.h | 10 ++++---- include/linux/mm.h | 49 +++++++++++++++++++++++--------------- mm/internal.h | 3 +++ mm/ksm.c | 43 +++++++++++++++++---------------- mm/mmap.c | 13 ++++++---- mm/mprotect.c | 46 +++++++++++++++++++++-------------- mm/mremap.c | 6 ++--- mm/vma.c | 34 ++++++++++++++------------ mm/vma.h | 14 ++++++++--- mm/vma_exec.c | 5 ++-- security/selinux/hooks.c | 4 +++- tools/testing/vma/include/custom.h | 3 --- 
tools/testing/vma/include/dup.h | 42 +++++++++++++++++--------------- tools/testing/vma/include/stubs.h | 9 +++---- tools/testing/vma/tests/merge.c | 3 +-- 29 files changed, 191 insertions(+), 140 deletions(-) (limited to 'include') diff --git a/arch/arc/include/asm/page.h b/arch/arc/include/asm/page.h index 38214e126c6d..facc7a03b250 100644 --- a/arch/arc/include/asm/page.h +++ b/arch/arc/include/asm/page.h @@ -131,7 +131,7 @@ static inline unsigned long virt_to_pfn(const void *kaddr) #define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr)) /* Default Permissions for stack/heaps pages (Non Executable) */ -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_NON_EXEC #define WANT_PAGE_VIRTUAL 1 diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h index ef11b721230e..fa4c1225dde5 100644 --- a/arch/arm/include/asm/page.h +++ b/arch/arm/include/asm/page.h @@ -184,7 +184,7 @@ extern int pfn_valid(unsigned long); #include -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_TSK_EXEC #include #include diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index b39cc1127e1f..e25d0d18f6d7 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -46,7 +46,12 @@ int pfn_is_map_memory(unsigned long pfn); #endif /* !__ASSEMBLER__ */ -#define VM_DATA_DEFAULT_FLAGS (VM_DATA_FLAGS_TSK_EXEC | VM_MTE_ALLOWED) +#ifdef CONFIG_ARM64_MTE +#define VMA_DATA_DEFAULT_FLAGS append_vma_flags(VMA_DATA_FLAGS_TSK_EXEC, \ + VMA_MTE_ALLOWED_BIT) +#else +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_TSK_EXEC +#endif #include diff --git a/arch/hexagon/include/asm/page.h b/arch/hexagon/include/asm/page.h index f0aed3ed812b..6d82572a7f21 100644 --- a/arch/hexagon/include/asm/page.h +++ b/arch/hexagon/include/asm/page.h @@ -90,7 +90,7 @@ struct page; #define virt_to_page(kaddr) pfn_to_page(PFN_DOWN(__pa(kaddr))) /* Default vm area behavior 
is non-executable. */ -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_NON_EXEC #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) diff --git a/arch/loongarch/include/asm/page.h b/arch/loongarch/include/asm/page.h index 327bf0bc92bf..79235f4fc399 100644 --- a/arch/loongarch/include/asm/page.h +++ b/arch/loongarch/include/asm/page.h @@ -104,7 +104,7 @@ struct page *tlb_virt_to_page(unsigned long kaddr); extern int __virt_addr_valid(volatile void *kaddr); #define virt_addr_valid(kaddr) __virt_addr_valid((volatile void *)(kaddr)) -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_TSK_EXEC #include #include diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h index 5ec428fcc887..50a382a0d8f6 100644 --- a/arch/mips/include/asm/page.h +++ b/arch/mips/include/asm/page.h @@ -213,7 +213,7 @@ extern bool __virt_addr_valid(const volatile void *kaddr); #define virt_addr_valid(kaddr) \ __virt_addr_valid((const volatile void *) (kaddr)) -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_TSK_EXEC extern unsigned long __kaslr_offset; static inline unsigned long kaslr_offset(void) diff --git a/arch/nios2/include/asm/page.h b/arch/nios2/include/asm/page.h index 722956ac0bf8..71eb7c1b67d4 100644 --- a/arch/nios2/include/asm/page.h +++ b/arch/nios2/include/asm/page.h @@ -85,7 +85,7 @@ extern struct page *mem_map; # define virt_to_page(vaddr) pfn_to_page(PFN_DOWN(virt_to_phys(vaddr))) # define virt_addr_valid(vaddr) pfn_valid(PFN_DOWN(virt_to_phys(vaddr))) -# define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC +# define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_NON_EXEC #include diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index f2bb1f98eebe..281f25e071a3 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -240,8 +240,8 @@ static inline 
const void *pfn_to_kaddr(unsigned long pfn) * and needs to be executable. This means the whole heap ends * up being executable. */ -#define VM_DATA_DEFAULT_FLAGS32 VM_DATA_FLAGS_TSK_EXEC -#define VM_DATA_DEFAULT_FLAGS64 VM_DATA_FLAGS_NON_EXEC +#define VMA_DATA_DEFAULT_FLAGS32 VMA_DATA_FLAGS_TSK_EXEC +#define VMA_DATA_DEFAULT_FLAGS64 VMA_DATA_FLAGS_NON_EXEC #ifdef __powerpc64__ #include diff --git a/arch/powerpc/include/asm/page_32.h b/arch/powerpc/include/asm/page_32.h index 25482405a811..1fd8c21f0a42 100644 --- a/arch/powerpc/include/asm/page_32.h +++ b/arch/powerpc/include/asm/page_32.h @@ -10,7 +10,7 @@ #endif #endif -#define VM_DATA_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS32 +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_DEFAULT_FLAGS32 #if defined(CONFIG_PPC_256K_PAGES) || \ (defined(CONFIG_PPC_8xx) && defined(CONFIG_PPC_16K_PAGES)) diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index 0f564a06bf68..d96c984d023b 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -84,9 +84,9 @@ extern u64 ppc64_pft_size; #endif /* __ASSEMBLER__ */ -#define VM_DATA_DEFAULT_FLAGS \ +#define VMA_DATA_DEFAULT_FLAGS \ (is_32bit_task() ? \ - VM_DATA_DEFAULT_FLAGS32 : VM_DATA_DEFAULT_FLAGS64) + VMA_DATA_DEFAULT_FLAGS32 : VMA_DATA_DEFAULT_FLAGS64) /* * This is the default if a program doesn't have a PT_GNU_STACK @@ -94,12 +94,12 @@ extern u64 ppc64_pft_size; * stack by default, so in the absence of a PT_GNU_STACK program header * we turn execute permission off. */ -#define VM_STACK_DEFAULT_FLAGS32 VM_DATA_FLAGS_EXEC -#define VM_STACK_DEFAULT_FLAGS64 VM_DATA_FLAGS_NON_EXEC +#define VMA_STACK_DEFAULT_FLAGS32 VMA_DATA_FLAGS_EXEC +#define VMA_STACK_DEFAULT_FLAGS64 VMA_DATA_FLAGS_NON_EXEC -#define VM_STACK_DEFAULT_FLAGS \ +#define VMA_STACK_DEFAULT_FLAGS \ (is_32bit_task() ? 
\ - VM_STACK_DEFAULT_FLAGS32 : VM_STACK_DEFAULT_FLAGS64) + VMA_STACK_DEFAULT_FLAGS32 : VMA_STACK_DEFAULT_FLAGS64) #include diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 187aad0a7b03..c78017061b17 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -204,7 +204,7 @@ static __always_inline void *pfn_to_kaddr(unsigned long pfn) (unsigned long)(_addr) >= PAGE_OFFSET && pfn_valid(virt_to_pfn(_addr)); \ }) -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_NON_EXEC #include #include diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index f339258135f7..56da819a79e6 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -277,7 +277,7 @@ static inline unsigned long virt_to_pfn(const void *kaddr) #define virt_addr_valid(kaddr) pfn_valid(phys_to_pfn(__pa_nodebug((unsigned long)(kaddr)))) -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_NON_EXEC #endif /* !__ASSEMBLER__ */ diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 018a8d906ca3..3e0801a0f782 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -26,7 +26,7 @@ #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_TSK_EXEC /* Physical address where kernel should be loaded. 
*/ #define LOAD_PHYSICAL_ADDR __ALIGN_KERNEL_MASK(CONFIG_PHYSICAL_START, CONFIG_PHYSICAL_ALIGN - 1) diff --git a/arch/x86/um/asm/vm-flags.h b/arch/x86/um/asm/vm-flags.h index df7a3896f5dd..622d36d6ddff 100644 --- a/arch/x86/um/asm/vm-flags.h +++ b/arch/x86/um/asm/vm-flags.h @@ -9,11 +9,11 @@ #ifdef CONFIG_X86_32 -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_TSK_EXEC #else -#define VM_STACK_DEFAULT_FLAGS (VM_GROWSDOWN | VM_DATA_FLAGS_EXEC) +#define VMA_STACK_DEFAULT_FLAGS append_vma_flags(VMA_DATA_FLAGS_EXEC, VMA_GROWSDOWN_BIT) #endif #endif diff --git a/include/linux/ksm.h b/include/linux/ksm.h index c982694c987b..d39d0d5483a2 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -17,8 +17,8 @@ #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, vm_flags_t *vm_flags); -vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, - vm_flags_t vm_flags); +vma_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, + vma_flags_t vma_flags); int ksm_enable_merge_any(struct mm_struct *mm); int ksm_disable_merge_any(struct mm_struct *mm); int ksm_disable(struct mm_struct *mm); @@ -103,10 +103,10 @@ bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ -static inline vm_flags_t ksm_vma_flags(struct mm_struct *mm, - const struct file *file, vm_flags_t vm_flags) +static inline vma_flags_t ksm_vma_flags(struct mm_struct *mm, + const struct file *file, vma_flags_t vma_flags) { - return vm_flags; + return vma_flags; } static inline int ksm_disable(struct mm_struct *mm) diff --git a/include/linux/mm.h b/include/linux/mm.h index de72382efac2..4042a584671e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -346,9 +346,9 @@ enum { * if KVM does not lock down the memory type. 
*/ DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39), -#ifdef CONFIG_PPC32 +#if defined(CONFIG_PPC32) DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1), -#else +#elif defined(CONFIG_64BIT) DECLARE_VMA_BIT(DROPPABLE, 40), #endif DECLARE_VMA_BIT(UFFD_MINOR, 41), @@ -503,31 +503,42 @@ enum { #endif #if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) #define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE) +#define VMA_DROPPABLE mk_vma_flags(VMA_DROPPABLE_BIT) #else #define VM_DROPPABLE VM_NONE +#define VMA_DROPPABLE EMPTY_VMA_FLAGS #endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) -#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) +#define TASK_EXEC_BIT ((current->personality & READ_IMPLIES_EXEC) ? \ + VMA_EXEC_BIT : VMA_READ_BIT) /* Common data flag combinations */ -#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ - VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) - -#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC +#define VMA_DATA_FLAGS_TSK_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ + TASK_EXEC_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, \ + VMA_MAYEXEC_BIT) +#define VMA_DATA_FLAGS_NON_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ + VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, VMA_MAYEXEC_BIT) +#define VMA_DATA_FLAGS_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ + VMA_EXEC_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, \ + VMA_MAYEXEC_BIT) + +#ifndef VMA_DATA_DEFAULT_FLAGS /* arch can override this */ +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_EXEC #endif -#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ -#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS +#ifndef VMA_STACK_DEFAULT_FLAGS /* arch 
can override this */ +#define VMA_STACK_DEFAULT_FLAGS VMA_DATA_DEFAULT_FLAGS #endif +#define VMA_STACK_FLAGS append_vma_flags(VMA_STACK_DEFAULT_FLAGS, \ + VMA_STACK_BIT, VMA_ACCOUNT_BIT) + +/* Temporary until VMA flags conversion complete. */ +#define VM_STACK_FLAGS vma_flags_to_legacy(VMA_STACK_FLAGS) + #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) #ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS @@ -536,8 +547,6 @@ enum { #define VM_SEALED_SYSMAP VM_NONE #endif -#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) - /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) #define VMA_ACCESS_FLAGS mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT) @@ -545,7 +554,10 @@ enum { /* * Special vmas that are non-mergable, non-mlock()able. */ -#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) + +#define VMA_SPECIAL_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_DONTEXPAND_BIT, \ + VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT) +#define VM_SPECIAL vma_flags_to_legacy(VMA_SPECIAL_FLAGS) /* * Physically remapped pages are special. Tell the @@ -1407,7 +1419,7 @@ static __always_inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, * vm_area_desc object describing a proposed VMA, e.g.: * * vma_desc_set_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, - * VMA_DONTDUMP_BIT); + * VMA_DONTDUMP_BIT); */ #define vma_desc_set_flags(desc, ...) 
\ vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) @@ -4045,7 +4057,6 @@ extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern struct file *get_task_exe_file(struct task_struct *task); -extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages); extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages); extern bool vma_is_special_mapping(const struct vm_area_struct *vma, diff --git a/mm/internal.h b/mm/internal.h index 3d3fa35e5fd1..ce954bab8a37 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1916,4 +1916,7 @@ static inline int get_sysctl_max_map_count(void) return READ_ONCE(sysctl_max_map_count); } +bool may_expand_vm(struct mm_struct *mm, const vma_flags_t *vma_flags, + unsigned long npages); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/ksm.c b/mm/ksm.c index 2a2f2f005fc3..7d5b76478f0b 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -735,21 +735,24 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; } -static bool ksm_compatible(const struct file *file, vm_flags_t vm_flags) +static bool ksm_compatible(const struct file *file, vma_flags_t vma_flags) { - if (vm_flags & (VM_SHARED | VM_MAYSHARE | VM_SPECIAL | - VM_HUGETLB | VM_DROPPABLE)) - return false; /* just ignore the advice */ - + /* Just ignore the advice. 
*/ + if (vma_flags_test_any(&vma_flags, VMA_SHARED_BIT, VMA_MAYSHARE_BIT, + VMA_HUGETLB_BIT)) + return false; + if (vma_flags_test_single_mask(&vma_flags, VMA_DROPPABLE)) + return false; + if (vma_flags_test_any_mask(&vma_flags, VMA_SPECIAL_FLAGS)) + return false; if (file_is_dax(file)) return false; - #ifdef VM_SAO - if (vm_flags & VM_SAO) + if (vma_flags_test(&vma_flags, VMA_SAO_BIT)) return false; #endif #ifdef VM_SPARC_ADI - if (vm_flags & VM_SPARC_ADI) + if (vma_flags_test(&vma_flags, VMA_SPARC_ADI_BIT)) return false; #endif @@ -758,7 +761,7 @@ static bool ksm_compatible(const struct file *file, vm_flags_t vm_flags) static bool vma_ksm_compatible(struct vm_area_struct *vma) { - return ksm_compatible(vma->vm_file, vma->vm_flags); + return ksm_compatible(vma->vm_file, vma->flags); } static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, @@ -2825,17 +2828,17 @@ static int ksm_scan_thread(void *nothing) return 0; } -static bool __ksm_should_add_vma(const struct file *file, vm_flags_t vm_flags) +static bool __ksm_should_add_vma(const struct file *file, vma_flags_t vma_flags) { - if (vm_flags & VM_MERGEABLE) + if (vma_flags_test(&vma_flags, VMA_MERGEABLE_BIT)) return false; - return ksm_compatible(file, vm_flags); + return ksm_compatible(file, vma_flags); } static void __ksm_add_vma(struct vm_area_struct *vma) { - if (__ksm_should_add_vma(vma->vm_file, vma->vm_flags)) + if (__ksm_should_add_vma(vma->vm_file, vma->flags)) vm_flags_set(vma, VM_MERGEABLE); } @@ -2860,16 +2863,16 @@ static int __ksm_del_vma(struct vm_area_struct *vma) * * @mm: Proposed VMA's mm_struct * @file: Proposed VMA's file-backed mapping, if any. - * @vm_flags: Proposed VMA"s flags. + * @vma_flags: Proposed VMA"s flags. * - * Returns: @vm_flags possibly updated to mark mergeable. + * Returns: @vma_flags possibly updated to mark mergeable. 
*/ -vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, - vm_flags_t vm_flags) +vma_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, + vma_flags_t vma_flags) { if (mm_flags_test(MMF_VM_MERGE_ANY, mm) && - __ksm_should_add_vma(file, vm_flags)) { - vm_flags |= VM_MERGEABLE; + __ksm_should_add_vma(file, vma_flags)) { + vma_flags_set(&vma_flags, VMA_MERGEABLE_BIT); /* * Generally, the flags here always include MMF_VM_MERGEABLE. * However, in rare cases, this flag may be cleared by ksmd who @@ -2879,7 +2882,7 @@ vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, __ksm_enter(mm); } - return vm_flags; + return vma_flags; } static void ksm_add_vmas(struct mm_struct *mm) diff --git a/mm/mmap.c b/mm/mmap.c index 2d2b814978bf..5754d1c36462 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -192,7 +192,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) brkvma = vma_prev_limit(&vmi, mm->start_brk); /* Ok, looks good - let it rip. */ - if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) + if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, + EMPTY_VMA_FLAGS) < 0) goto out; mm->brk = brk; @@ -1203,7 +1204,8 @@ out: int vm_brk_flags(unsigned long addr, unsigned long request, bool is_exec) { - const vm_flags_t vm_flags = is_exec ? VM_EXEC : 0; + const vma_flags_t vma_flags = is_exec ? 
+ mk_vma_flags(VMA_EXEC_BIT) : EMPTY_VMA_FLAGS; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; unsigned long len; @@ -1230,7 +1232,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, bool is_exec) goto munmap_failed; vma = vma_prev(&vmi); - ret = do_brk_flags(&vmi, vma, addr, len, vm_flags); + ret = do_brk_flags(&vmi, vma, addr, len, vma_flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); @@ -1328,12 +1330,13 @@ destroy: * Return true if the calling process may expand its vm space by the passed * number of pages */ -bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) +bool may_expand_vm(struct mm_struct *mm, const vma_flags_t *vma_flags, + unsigned long npages) { if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT) return false; - if (is_data_mapping(flags) && + if (is_data_mapping_vma_flags(vma_flags) && mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) { /* Workaround for Valgrind */ if (rlimit(RLIMIT_DATA) == 0 && diff --git a/mm/mprotect.c b/mm/mprotect.c index 9681f055b9fc..eaa724b99908 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -697,7 +697,8 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, unsigned long start, unsigned long end, vm_flags_t newflags) { struct mm_struct *mm = vma->vm_mm; - vm_flags_t oldflags = READ_ONCE(vma->vm_flags); + const vma_flags_t old_vma_flags = READ_ONCE(vma->flags); + vma_flags_t new_vma_flags = legacy_to_vma_flags(newflags); long nrpages = (end - start) >> PAGE_SHIFT; unsigned int mm_cp_flags = 0; unsigned long charged = 0; @@ -706,7 +707,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, if (vma_is_sealed(vma)) return -EPERM; - if (newflags == oldflags) { + if (vma_flags_same_pair(&old_vma_flags, &new_vma_flags)) { *pprev = vma; return 0; } @@ -717,8 +718,9 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * uncommon case, so 
doesn't need to be very optimized. */ if (arch_has_pfn_modify_check() && - (oldflags & (VM_PFNMAP|VM_MIXEDMAP)) && - (newflags & VM_ACCESS_FLAGS) == 0) { + vma_flags_test_any(&old_vma_flags, VMA_PFNMAP_BIT, + VMA_MIXEDMAP_BIT) && + !vma_flags_test_any_mask(&new_vma_flags, VMA_ACCESS_FLAGS)) { pgprot_t new_pgprot = vm_get_page_prot(newflags); error = walk_page_range(current->mm, start, end, @@ -736,28 +738,31 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * hugetlb mapping were accounted for even if read-only so there is * no need to account for them here. */ - if (newflags & VM_WRITE) { + if (vma_flags_test(&new_vma_flags, VMA_WRITE_BIT)) { /* Check space limits when area turns into data. */ - if (!may_expand_vm(mm, newflags, nrpages) && - may_expand_vm(mm, oldflags, nrpages)) + if (!may_expand_vm(mm, &new_vma_flags, nrpages) && + may_expand_vm(mm, &old_vma_flags, nrpages)) return -ENOMEM; - if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| - VM_SHARED|VM_NORESERVE))) { + if (!vma_flags_test_any(&old_vma_flags, + VMA_ACCOUNT_BIT, VMA_WRITE_BIT, VMA_HUGETLB_BIT, + VMA_SHARED_BIT, VMA_NORESERVE_BIT)) { charged = nrpages; if (security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; - newflags |= VM_ACCOUNT; + vma_flags_set(&new_vma_flags, VMA_ACCOUNT_BIT); } - } else if ((oldflags & VM_ACCOUNT) && vma_is_anonymous(vma) && - !vma->anon_vma) { - newflags &= ~VM_ACCOUNT; + } else if (vma_flags_test(&old_vma_flags, VMA_ACCOUNT_BIT) && + vma_is_anonymous(vma) && !vma->anon_vma) { + vma_flags_clear(&new_vma_flags, VMA_ACCOUNT_BIT); } + newflags = vma_flags_to_legacy(new_vma_flags); vma = vma_modify_flags(vmi, *pprev, vma, start, end, &newflags); if (IS_ERR(vma)) { error = PTR_ERR(vma); goto fail; } + new_vma_flags = legacy_to_vma_flags(newflags); *pprev = vma; @@ -773,19 +778,24 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, change_protection(tlb, vma, start, end, mm_cp_flags); - if ((oldflags & VM_ACCOUNT) && !(newflags & 
VM_ACCOUNT)) + if (vma_flags_test(&old_vma_flags, VMA_ACCOUNT_BIT) && + !vma_flags_test(&new_vma_flags, VMA_ACCOUNT_BIT)) vm_unacct_memory(nrpages); /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major * fault on access. */ - if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED && - (newflags & VM_WRITE)) { - populate_vma_page_range(vma, start, end, NULL); + if (vma_flags_test(&new_vma_flags, VMA_WRITE_BIT)) { + const vma_flags_t mask = + vma_flags_and(&old_vma_flags, VMA_WRITE_BIT, + VMA_SHARED_BIT, VMA_LOCKED_BIT); + + if (vma_flags_same(&mask, VMA_LOCKED_BIT)) + populate_vma_page_range(vma, start, end, NULL); } - vm_stat_account(mm, oldflags, -nrpages); + vm_stat_account(mm, vma_flags_to_legacy(old_vma_flags), -nrpages); vm_stat_account(mm, newflags, nrpages); perf_event_mmap(vma); return 0; diff --git a/mm/mremap.c b/mm/mremap.c index 36b3f1caebad..e9c8b1d05832 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1472,10 +1472,10 @@ static unsigned long mremap_to(struct vma_remap_struct *vrm) /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */ if (vrm->flags & MREMAP_DONTUNMAP) { - vm_flags_t vm_flags = vrm->vma->vm_flags; + vma_flags_t vma_flags = vrm->vma->flags; unsigned long pages = vrm->old_len >> PAGE_SHIFT; - if (!may_expand_vm(mm, vm_flags, pages)) + if (!may_expand_vm(mm, &vma_flags, pages)) return -ENOMEM; } @@ -1813,7 +1813,7 @@ static int check_prep_vma(struct vma_remap_struct *vrm) if (!mlock_future_ok(mm, vma->vm_flags & VM_LOCKED, vrm->delta)) return -EAGAIN; - if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT)) + if (!may_expand_vm(mm, &vma->flags, vrm->delta >> PAGE_SHIFT)) return -ENOMEM; return 0; diff --git a/mm/vma.c b/mm/vma.c index 6af26619e020..9362860389ae 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2385,7 +2385,7 @@ static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, static void update_ksm_flags(struct mmap_state *map) { - map->vm_flags = ksm_vma_flags(map->mm, map->file, 
map->vm_flags); + map->vma_flags = ksm_vma_flags(map->mm, map->file, map->vma_flags); } static void set_desc_from_map(struct vm_area_desc *desc, @@ -2446,7 +2446,7 @@ static int __mmap_setup(struct mmap_state *map, struct vm_area_desc *desc, } /* Check against address space limit. */ - if (!may_expand_vm(map->mm, map->vm_flags, map->pglen - vms->nr_pages)) + if (!may_expand_vm(map->mm, &map->vma_flags, map->pglen - vms->nr_pages)) return -ENOMEM; /* Private writable mapping: check memory availability. */ @@ -2866,20 +2866,22 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return ret; } -/* +/** * do_brk_flags() - Increase the brk vma if the flags match. * @vmi: The vma iterator * @addr: The start address * @len: The length of the increase * @vma: The vma, - * @vm_flags: The VMA Flags + * @vma_flags: The VMA Flags * * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags * do not match then create a new anonymous VMA. Eventually we may be able to * do some brk-specific accounting here. + * + * Returns: %0 on success, or otherwise an error. */ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, unsigned long len, vm_flags_t vm_flags) + unsigned long addr, unsigned long len, vma_flags_t vma_flags) { struct mm_struct *mm = current->mm; @@ -2887,9 +2889,12 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, * Check against address space limits by the changed size * Note: This happens *after* clearing old mappings in some code paths. 
*/ - vm_flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - vm_flags = ksm_vma_flags(mm, NULL, vm_flags); - if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) + vma_flags_set_mask(&vma_flags, VMA_DATA_DEFAULT_FLAGS); + vma_flags_set(&vma_flags, VMA_ACCOUNT_BIT); + vma_flags_set_mask(&vma_flags, mm->def_vma_flags); + + vma_flags = ksm_vma_flags(mm, NULL, vma_flags); + if (!may_expand_vm(mm, &vma_flags, len >> PAGE_SHIFT)) return -ENOMEM; if (mm->map_count > get_sysctl_max_map_count()) @@ -2903,7 +2908,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, * occur after forking, so the expand will only happen on new VMAs. */ if (vma && vma->vm_end == addr) { - VMG_STATE(vmg, mm, vmi, addr, addr + len, vm_flags, PHYS_PFN(addr)); + VMG_STATE(vmg, mm, vmi, addr, addr + len, vma_flags, PHYS_PFN(addr)); vmg.prev = vma; /* vmi is positioned at prev, which this mode expects. */ @@ -2924,8 +2929,8 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_set_anonymous(vma); vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); - vm_flags_init(vma, vm_flags); - vma->vm_page_prot = vm_get_page_prot(vm_flags); + vma->flags = vma_flags; + vma->vm_page_prot = vm_get_page_prot(vma_flags_to_legacy(vma_flags)); vma_start_write(vma); if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; @@ -2936,10 +2941,10 @@ out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; mm->data_vm += len >> PAGE_SHIFT; - if (vm_flags & VM_LOCKED) + if (vma_flags_test(&vma_flags, VMA_LOCKED_BIT)) mm->locked_vm += (len >> PAGE_SHIFT); if (pgtable_supports_soft_dirty()) - vm_flags_set(vma, VM_SOFTDIRTY); + vma_set_flags(vma, VMA_SOFTDIRTY_BIT); return 0; mas_store_fail: @@ -3070,7 +3075,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long new_start; /* address space limit tests */ - if (!may_expand_vm(mm, vma->vm_flags, grow)) + if (!may_expand_vm(mm, &vma->flags, grow)) return -ENOMEM; /* Stack limit test */ 
@@ -3289,7 +3294,6 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { unsigned long charged = vma_pages(vma); - if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) return -ENOMEM; diff --git a/mm/vma.h b/mm/vma.h index cf8926558bf6..1f2de6cb3b97 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -237,13 +237,13 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start); } -#define VMG_STATE(name, mm_, vmi_, start_, end_, vm_flags_, pgoff_) \ +#define VMG_STATE(name, mm_, vmi_, start_, end_, vma_flags_, pgoff_) \ struct vma_merge_struct name = { \ .mm = mm_, \ .vmi = vmi_, \ .start = start_, \ .end = end_, \ - .vm_flags = vm_flags_, \ + .vma_flags = vma_flags_, \ .pgoff = pgoff_, \ .state = VMA_MERGE_START, \ } @@ -465,7 +465,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, struct list_head *uf); int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, - unsigned long addr, unsigned long request, unsigned long flags); + unsigned long addr, unsigned long request, + vma_flags_t vma_flags); unsigned long unmapped_area(struct vm_unmapped_area_info *info); unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info); @@ -527,6 +528,13 @@ static inline bool is_data_mapping(vm_flags_t flags) return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; } +static inline bool is_data_mapping_vma_flags(const vma_flags_t *vma_flags) +{ + const vma_flags_t mask = vma_flags_and(vma_flags, + VMA_WRITE_BIT, VMA_SHARED_BIT, VMA_STACK_BIT); + + return vma_flags_same(&mask, VMA_WRITE_BIT); +} static inline void vma_iter_config(struct vma_iterator *vmi, unsigned long index, unsigned long last) diff --git a/mm/vma_exec.c b/mm/vma_exec.c index 8134e1afca68..5cee8b7efa0f 100644 --- a/mm/vma_exec.c +++ b/mm/vma_exec.c @@ -36,7 +36,8 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) unsigned long new_start = old_start - shift; 
unsigned long new_end = old_end - shift; VMA_ITERATOR(vmi, mm, new_start); - VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff); + VMG_STATE(vmg, mm, &vmi, new_start, old_end, EMPTY_VMA_FLAGS, + vma->vm_pgoff); struct vm_area_struct *next; struct mmu_gather tlb; PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length); @@ -135,7 +136,7 @@ int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ - BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); + VM_WARN_ON_ONCE(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; if (pgtable_supports_soft_dirty()) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index d8224ea113d1..903303e084c2 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -7713,6 +7713,8 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = { static __init int selinux_init(void) { + vma_flags_t data_default_flags = VMA_DATA_DEFAULT_FLAGS; + pr_info("SELinux: Initializing.\n"); memset(&selinux_state, 0, sizeof(selinux_state)); @@ -7729,7 +7731,7 @@ static __init int selinux_init(void) AUDIT_CFG_LSM_SECCTX_SUBJECT | AUDIT_CFG_LSM_SECCTX_OBJECT); - default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC); + default_noexec = !vma_flags_test(&data_default_flags, VMA_EXEC_BIT); if (!default_noexec) pr_notice("SELinux: virtual memory is executable by default\n"); diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index b7d9eb0a44e4..744fe874c168 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -95,6 +95,3 @@ static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) { return PAGE_SIZE; } - -#define VMA_SPECIAL_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_DONTEXPAND_BIT, \ - VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT) diff --git 
a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 244ee02dc21d..36373b81ad24 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -314,27 +314,33 @@ enum { /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) -#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) +#define TASK_EXEC_BIT ((current->personality & READ_IMPLIES_EXEC) ? \ + VM_EXEC_BIT : VM_READ_BIT) /* Common data flag combinations */ -#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ - VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) - -#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC +#define VMA_DATA_FLAGS_TSK_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ + TASK_EXEC_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, \ + VMA_MAYEXEC_BIT) +#define VMA_DATA_FLAGS_NON_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ + VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, VMA_MAYEXEC_BIT) +#define VMA_DATA_FLAGS_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ + VMA_EXEC_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, \ + VMA_MAYEXEC_BIT) + +#ifndef VMA_DATA_DEFAULT_FLAGS /* arch can override this */ +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_EXEC #endif -#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ -#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS +#ifndef VMA_STACK_DEFAULT_FLAGS /* arch can override this */ +#define VMA_STACK_DEFAULT_FLAGS VMA_DATA_DEFAULT_FLAGS #endif -#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) +#define VMA_STACK_FLAGS append_vma_flags(VMA_STACK_DEFAULT_FLAGS, \ + VMA_STACK_BIT, VMA_ACCOUNT_BIT) +/* Temporary until VMA flags conversion complete. 
*/ +#define VM_STACK_FLAGS vma_flags_to_legacy(VMA_STACK_FLAGS) -#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) +#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) @@ -345,6 +351,9 @@ enum { */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) +#define VMA_SPECIAL_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_DONTEXPAND_BIT, \ + VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT) + #define VMA_REMAP_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT, \ VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT) @@ -357,11 +366,6 @@ enum { /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) -#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) - -#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) - #define RLIMIT_STACK 3 /* max stack size */ #define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */ diff --git a/tools/testing/vma/include/stubs.h b/tools/testing/vma/include/stubs.h index 416bb93f5005..b5dced3b0bd4 100644 --- a/tools/testing/vma/include/stubs.h +++ b/tools/testing/vma/include/stubs.h @@ -101,10 +101,10 @@ static inline bool shmem_file(struct file *file) return false; } -static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, - const struct file *file, vm_flags_t vm_flags) +static inline vma_flags_t ksm_vma_flags(struct mm_struct *mm, + const struct file *file, vma_flags_t vma_flags) { - return vm_flags; + return vma_flags; } static inline void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) @@ -239,7 +239,8 @@ static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) return 0; } -static inline bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, +static inline bool may_expand_vm(struct mm_struct *mm, + const vma_flags_t *vma_flags, unsigned long npages) 
{ return true; diff --git a/tools/testing/vma/tests/merge.c b/tools/testing/vma/tests/merge.c index d3e725dc0000..44e3977e3fc0 100644 --- a/tools/testing/vma/tests/merge.c +++ b/tools/testing/vma/tests/merge.c @@ -1429,11 +1429,10 @@ static bool test_expand_only_mode(void) { vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT); - vm_flags_t legacy_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vm_area_struct *vma_prev, *vma; - VMG_STATE(vmg, &mm, &vmi, 0x5000, 0x9000, legacy_flags, 5); + VMG_STATE(vmg, &mm, &vmi, 0x5000, 0x9000, vma_flags, 5); /* * Place a VMA prior to the one we're expanding so we assert that we do -- cgit v1.2.3 From d720b81d01b137dfc23e07461b05b76f822af6ab Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:36 +0000 Subject: mm/vma: introduce vma_clear_flags[_mask]() Introduce a helper function and helper macro to easily clear a VMA's flags using the new vma_flags_t vma->flags field: * vma_clear_flags_mask() - Clears all of the flags in a specified mask in the VMA's flags field. * vma_clear_flags() - Clears all of the specified individual VMA flag bits in a VMA's flags field. Also update the VMA tests to reflect the change. Link: https://lkml.kernel.org/r/9bd15da35c2c90e7441265adf01b5c2d3b5c6d41.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 ++++++++++++++++ tools/testing/vma/include/dup.h | 9 +++++++++ 2 files changed, 25 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4042a584671e..6b614f8af045 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1363,6 +1363,22 @@ static __always_inline void vma_set_flags_mask(struct vm_area_struct *vma, #define vma_set_flags(vma, ...) \ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) +/* Helper to clear all VMA flags in a VMA. */ +static __always_inline void vma_clear_flags_mask(struct vm_area_struct *vma, + vma_flags_t flags) +{ + vma_flags_clear_mask(&vma->flags, flags); +} + +/* + * Helper macro for clearing VMA flags, e.g.: + * + * vma_clear_flags(vma, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, + * VMA_DONTDUMP_BIT); + */ +#define vma_clear_flags(vma, ...) \ + vma_clear_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) + /* * Test whether a specific VMA flag is set in a VMA descriptor, e.g.: * diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 36373b81ad24..93ea600d0895 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -1065,6 +1065,15 @@ static __always_inline void vma_set_flags_mask(struct vm_area_struct *vma, #define vma_set_flags(vma, ...) 
\ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) +static __always_inline void vma_clear_flags_mask(struct vm_area_struct *vma, + vma_flags_t flags) +{ + vma_flags_clear_mask(&vma->flags, flags); +} + +#define vma_clear_flags(vma, ...) \ + vma_clear_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) + static __always_inline bool vma_desc_test(const struct vm_area_desc *desc, vma_flag_t bit) { -- cgit v1.2.3 From 769669bd9ca4cbae2562d57fe753efdcf17a196d Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:38 +0000 Subject: mm/vma: convert as much as we can in mm/vma.c to vma_flags_t Now we have established a good foundation for vm_flags_t to vma_flags_t changes, update mm/vma.c to utilise vma_flags_t wherever possible. We are able to convert VM_STARTGAP_FLAGS entirely as this is only used in mm/vma.c, and to account for the fact we can't use VM_NONE to make life easier, place the definition of this within existing #ifdef's to be cleaner. Generally the remaining changes are mechanical. Also update the VMA tests to reflect the changes. Link: https://lkml.kernel.org/r/5fdeaf8af9a12c2a5d68497495f52fa627d05a5b.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++- mm/vma.c | 89 ++++++++++++++++++++++----------------- tools/testing/vma/include/dup.h | 4 ++ tools/testing/vma/include/stubs.h | 2 +- 4 files changed, 59 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6b614f8af045..c6b40dc88918 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -463,8 +463,10 @@ enum { #if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) || \ defined(CONFIG_RISCV_USER_CFI) #define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) +#define VMA_STARTGAP_FLAGS mk_vma_flags(VMA_GROWSDOWN_BIT, VMA_SHADOW_STACK_BIT) #else #define VM_SHADOW_STACK VM_NONE +#define VMA_STARTGAP_FLAGS mk_vma_flags(VMA_GROWSDOWN_BIT) #endif #if defined(CONFIG_PPC64) #define VM_SAO INIT_VM_FLAG(SAO) @@ -539,8 +541,6 @@ enum { /* Temporary until VMA flags conversion complete. */ #define VM_STACK_FLAGS vma_flags_to_legacy(VMA_STACK_FLAGS) -#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) - #ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS #define VM_SEALED_SYSMAP VM_SEALED #else @@ -584,6 +584,8 @@ enum { /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) +#define VMA_LOCKED_MASK mk_vma_flags(VMA_LOCKED_BIT, VMA_LOCKONFAULT_BIT) + /* These flags can be updated atomically via VMA/mmap read lock. 
*/ #define VM_ATOMIC_SET_ALLOWED VM_MAYBE_GUARD diff --git a/mm/vma.c b/mm/vma.c index 9362860389ae..9d194f8e7acb 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -185,7 +185,7 @@ static void init_multi_vma_prep(struct vma_prepare *vp, } /* - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) + * Return true if we can merge this (vma_flags,anon_vma,file,vm_pgoff) * in front of (at a lower virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) @@ -211,7 +211,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg) } /* - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) + * Return true if we can merge this (vma_flags,anon_vma,file,vm_pgoff) * beyond (at a higher virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) @@ -850,7 +850,8 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( * furthermost left or right side of the VMA, then we have no chance of * merging and should abort. */ - if (vmg->vm_flags & VM_SPECIAL || (!left_side && !right_side)) + if (vma_flags_test_any_mask(&vmg->vma_flags, VMA_SPECIAL_FLAGS) || + (!left_side && !right_side)) return NULL; if (left_side) @@ -1072,7 +1073,8 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) vmg->state = VMA_MERGE_NOMERGE; /* Special VMAs are unmergeable, also if no prev/next. 
*/ - if ((vmg->vm_flags & VM_SPECIAL) || (!prev && !next)) + if (vma_flags_test_any_mask(&vmg->vma_flags, VMA_SPECIAL_FLAGS) || + (!prev && !next)) return NULL; can_merge_left = can_vma_merge_left(vmg); @@ -1459,17 +1461,17 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, nrpages = vma_pages(next); vms->nr_pages += nrpages; - if (next->vm_flags & VM_LOCKED) + if (vma_test(next, VMA_LOCKED_BIT)) vms->locked_vm += nrpages; - if (next->vm_flags & VM_ACCOUNT) + if (vma_test(next, VMA_ACCOUNT_BIT)) vms->nr_accounted += nrpages; if (is_exec_mapping(next->vm_flags)) vms->exec_vm += nrpages; else if (is_stack_mapping(next->vm_flags)) vms->stack_vm += nrpages; - else if (is_data_mapping(next->vm_flags)) + else if (is_data_mapping_vma_flags(&next->flags)) vms->data_vm += nrpages; if (vms->uf) { @@ -2065,14 +2067,13 @@ static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) static bool vma_is_shared_writable(struct vm_area_struct *vma) { - return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == - (VM_WRITE | VM_SHARED); + return vma_test_all(vma, VMA_WRITE_BIT, VMA_SHARED_BIT); } static bool vma_fs_can_writeback(struct vm_area_struct *vma) { /* No managed pages to writeback. */ - if (vma->vm_flags & VM_PFNMAP) + if (vma_test(vma, VMA_PFNMAP_BIT)) return false; return vma->vm_file && vma->vm_file->f_mapping && @@ -2338,8 +2339,11 @@ void mm_drop_all_locks(struct mm_struct *mm) * We account for memory if it's a private writeable mapping, * not hugepages and VM_NORESERVE wasn't set. */ -static bool accountable_mapping(struct file *file, vm_flags_t vm_flags) +static bool accountable_mapping(struct mmap_state *map) { + const struct file *file = map->file; + vma_flags_t mask; + /* * hugetlb has its own accounting separate from the core VM * VM_HUGETLB may not be set yet so we cannot check for that flag. 
@@ -2347,7 +2351,9 @@ static bool accountable_mapping(struct file *file, vm_flags_t vm_flags) if (file && is_file_hugepages(file)) return false; - return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; + mask = vma_flags_and(&map->vma_flags, VMA_NORESERVE_BIT, VMA_SHARED_BIT, + VMA_WRITE_BIT); + return vma_flags_same(&mask, VMA_WRITE_BIT); } /* @@ -2450,7 +2456,7 @@ static int __mmap_setup(struct mmap_state *map, struct vm_area_desc *desc, return -ENOMEM; /* Private writable mapping: check memory availability. */ - if (accountable_mapping(map->file, map->vm_flags)) { + if (accountable_mapping(map)) { map->charged = map->pglen; map->charged -= vms->nr_accounted; if (map->charged) { @@ -2460,7 +2466,7 @@ static int __mmap_setup(struct mmap_state *map, struct vm_area_desc *desc, } vms->nr_accounted = 0; - map->vm_flags |= VM_ACCOUNT; + vma_flags_set(&map->vma_flags, VMA_ACCOUNT_BIT); } /* @@ -2508,12 +2514,12 @@ static int __mmap_new_file_vma(struct mmap_state *map, * Drivers should not permit writability when previously it was * disallowed. 
*/ - VM_WARN_ON_ONCE(map->vm_flags != vma->vm_flags && - !(map->vm_flags & VM_MAYWRITE) && - (vma->vm_flags & VM_MAYWRITE)); + VM_WARN_ON_ONCE(!vma_flags_same_pair(&map->vma_flags, &vma->flags) && + !vma_flags_test(&map->vma_flags, VMA_MAYWRITE_BIT) && + vma_test(vma, VMA_MAYWRITE_BIT)); map->file = vma->vm_file; - map->vm_flags = vma->vm_flags; + map->vma_flags = vma->flags; return 0; } @@ -2544,7 +2550,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) vma_iter_config(vmi, map->addr, map->end); vma_set_range(vma, map->addr, map->end, map->pgoff); - vm_flags_init(vma, map->vm_flags); + vma->flags = map->vma_flags; vma->vm_page_prot = map->page_prot; if (vma_iter_prealloc(vmi, vma)) { @@ -2554,7 +2560,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) if (map->file) error = __mmap_new_file_vma(map, vma); - else if (map->vm_flags & VM_SHARED) + else if (vma_flags_test(&map->vma_flags, VMA_SHARED_BIT)) error = shmem_zero_setup(vma); else vma_set_anonymous(vma); @@ -2564,7 +2570,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) if (!map->check_ksm_early) { update_ksm_flags(map); - vm_flags_init(vma, map->vm_flags); + vma->flags = map->vma_flags; } #ifdef CONFIG_SPARC64 @@ -2604,7 +2610,6 @@ free_vma: static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) { struct mm_struct *mm = map->mm; - vm_flags_t vm_flags = vma->vm_flags; perf_event_mmap(vma); @@ -2612,9 +2617,9 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) vms_complete_munmap_vmas(&map->vms, &map->mas_detach); vm_stat_account(mm, vma->vm_flags, map->pglen); - if (vm_flags & VM_LOCKED) { + if (vma_test(vma, VMA_LOCKED_BIT)) { if (!vma_supports_mlock(vma)) - vm_flags_clear(vma, VM_LOCKED_MASK); + vma_clear_flags_mask(vma, VMA_LOCKED_MASK); else mm->locked_vm += map->pglen; } @@ -2630,7 +2635,7 @@ static void __mmap_complete(struct mmap_state *map, 
struct vm_area_struct *vma) * a completely new data area). */ if (pgtable_supports_soft_dirty()) - vm_flags_set(vma, VM_SOFTDIRTY); + vma_set_flags(vma, VMA_SOFTDIRTY_BIT); vma_set_page_prot(vma); } @@ -2993,7 +2998,8 @@ retry: gap = vma_iter_addr(&vmi) + info->start_gap; gap += (info->align_offset - gap) & info->align_mask; tmp = vma_next(&vmi); - if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ + /* Avoid prev check if possible */ + if (tmp && vma_test_any_mask(tmp, VMA_STARTGAP_FLAGS)) { if (vm_start_gap(tmp) < gap + length - 1) { low_limit = tmp->vm_end; vma_iter_reset(&vmi); @@ -3045,7 +3051,8 @@ retry: gap -= (gap - info->align_offset) & info->align_mask; gap_end = vma_iter_end(&vmi); tmp = vma_next(&vmi); - if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ + /* Avoid prev check if possible */ + if (tmp && vma_test_any_mask(tmp, VMA_STARTGAP_FLAGS)) { if (vm_start_gap(tmp) < gap_end) { high_limit = vm_start_gap(tmp); vma_iter_reset(&vmi); @@ -3083,12 +3090,16 @@ static int acct_stack_growth(struct vm_area_struct *vma, return -ENOMEM; /* mlock limit tests */ - if (!mlock_future_ok(mm, vma->vm_flags & VM_LOCKED, grow << PAGE_SHIFT)) + if (!mlock_future_ok(mm, vma_test(vma, VMA_LOCKED_BIT), + grow << PAGE_SHIFT)) return -ENOMEM; /* Check to ensure the stack will not grow into a hugetlb-only region */ - new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : - vma->vm_end - size; + new_start = vma->vm_end - size; +#ifdef CONFIG_STACK_GROWSUP + if (vma_test(vma, VMA_GROWSUP_BIT)) + new_start = vma->vm_start; +#endif if (is_hugepage_only_range(vma->vm_mm, new_start, size)) return -EFAULT; @@ -3102,7 +3113,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, return 0; } -#if defined(CONFIG_STACK_GROWSUP) +#ifdef CONFIG_STACK_GROWSUP /* * PA-RISC uses this for its stack. * vma is the last one with address > vma->vm_end. Have to extend vma. 
@@ -3115,7 +3126,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) int error = 0; VMA_ITERATOR(vmi, mm, vma->vm_start); - if (!(vma->vm_flags & VM_GROWSUP)) + if (!vma_test(vma, VMA_GROWSUP_BIT)) return -EFAULT; mmap_assert_write_locked(mm); @@ -3135,7 +3146,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) next = find_vma_intersection(mm, vma->vm_end, gap_addr); if (next && vma_is_accessible(next)) { - if (!(next->vm_flags & VM_GROWSUP)) + if (!vma_test(next, VMA_GROWSUP_BIT)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ } @@ -3169,7 +3180,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { - if (vma->vm_flags & VM_LOCKED) + if (vma_test(vma, VMA_LOCKED_BIT)) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); @@ -3200,7 +3211,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) int error = 0; VMA_ITERATOR(vmi, mm, vma->vm_start); - if (!(vma->vm_flags & VM_GROWSDOWN)) + if (!vma_test(vma, VMA_GROWSDOWN_BIT)) return -EFAULT; mmap_assert_write_locked(mm); @@ -3213,7 +3224,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) prev = vma_prev(&vmi); /* Check that both stack segments have the same anon_vma? 
*/ if (prev) { - if (!(prev->vm_flags & VM_GROWSDOWN) && + if (!vma_test(prev, VMA_GROWSDOWN_BIT) && vma_is_accessible(prev) && (address - prev->vm_end < stack_guard_gap)) return -ENOMEM; @@ -3248,7 +3259,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) if (grow <= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { - if (vma->vm_flags & VM_LOCKED) + if (vma_test(vma, VMA_LOCKED_BIT)) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); @@ -3297,7 +3308,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) return -ENOMEM; - if ((vma->vm_flags & VM_ACCOUNT) && + if (vma_test(vma, VMA_ACCOUNT_BIT) && security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; @@ -3319,7 +3330,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) } if (vma_link(mm, vma)) { - if (vma->vm_flags & VM_ACCOUNT) + if (vma_test(vma, VMA_ACCOUNT_BIT)) vm_unacct_memory(charged); return -ENOMEM; } diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 93ea600d0895..58a621ec389f 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -267,8 +267,10 @@ enum { #endif /* CONFIG_ARCH_HAS_PKEYS */ #if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) #define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) +#define VMA_STARTGAP_FLAGS mk_vma_flags(VMA_GROWSDOWN_BIT, VMA_SHADOW_STACK_BIT) #else #define VM_SHADOW_STACK VM_NONE +#define VMA_STARTGAP_FLAGS mk_vma_flags(VMA_GROWSDOWN_BIT) #endif #if defined(CONFIG_PPC64) #define VM_SAO INIT_VM_FLAG(SAO) @@ -366,6 +368,8 @@ enum { /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) +#define VMA_LOCKED_MASK mk_vma_flags(VMA_LOCKED_BIT, VMA_LOCKONFAULT_BIT) + #define RLIMIT_STACK 3 /* max stack size */ #define 
RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */ diff --git a/tools/testing/vma/include/stubs.h b/tools/testing/vma/include/stubs.h index b5dced3b0bd4..5afb0afe2d48 100644 --- a/tools/testing/vma/include/stubs.h +++ b/tools/testing/vma/include/stubs.h @@ -229,7 +229,7 @@ static inline bool signal_pending(void *p) return false; } -static inline bool is_file_hugepages(struct file *file) +static inline bool is_file_hugepages(const struct file *file) { return false; } -- cgit v1.2.3 From a06eb2f8279e0b2b42799d42041f144377f5a086 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:40 +0000 Subject: mm/vma: convert vma_modify_flags[_uffd]() to use vma_flags_t Update the vma_modify_flags() and vma_modify_flags_uffd() functions to accept a vma_flags_t parameter rather than a vm_flags_t one, and propagate the changes as needed to implement this change. Also add vma_flags_reset_once() in replacement of vm_flags_reset_once(). We still need to be careful here because we need to avoid tearing, so maintain the assumption that the first system word set of flags are the only ones that require protection from tearing, and retain this functionality. We can copy the remainder of VMA flags above 64 bits normally. But hopefully by the time that happens, we will have replaced the logic that requires these WRITE_ONCE()'s with something else. We also replace instances of vm_flags_reset() with a simple write of VMA flags. We no longer perform a number of checks, most notably the VMA flags asserts, because: 1. We might be operating on a VMA that is not yet added to the tree. 2. We might be operating on a VMA that is now detached. 3. Really in all but core code, you should be using vma_desc_xxx(). 4. Other VMA fields are manipulated with no such checks. 5. It'd be egregious to have to add variants of flag functions just to account for cases such as the above, especially when we don't do so for other VMA fields.
Drivers are the problematic cases and why it was especially important (and also for debug as VMA locks were introduced), the mmap_prepare work is solving this generally. Additionally, we can fairly safely assume by this point the soft dirty flags are being set correctly, so it's reasonable to drop this also. Finally, update the VMA tests to reflect this. Link: https://lkml.kernel.org/r/51afbb2b8c3681003cc7926647e37335d793836e.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 ++++++++++------------ include/linux/userfaultfd_k.h | 3 +++ mm/madvise.c | 10 ++++++---- mm/mlock.c | 38 +++++++++++++++++++++----------------- mm/mprotect.c | 7 +++---- mm/mseal.c | 11 +++++++---- mm/userfaultfd.c | 21 ++++++++++++++------- mm/vma.c | 15 ++++++++------- mm/vma.h | 15 +++++++-------- tools/testing/vma/include/dup.h | 22 +++++++++++++--------- tools/testing/vma/tests/merge.c | 3 +-- 11 files changed, 93 insertions(+), 74 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index c6b40dc88918..72bc5016094b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -954,22 +954,20 @@ 
static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_init(vma, flags); } -static inline void vm_flags_reset_once(struct vm_area_struct *vma, - vm_flags_t flags) +static inline void vma_flags_reset_once(struct vm_area_struct *vma, + vma_flags_t *flags) { - vma_assert_write_locked(vma); - /* - * If VMA flags exist beyond the first system word, also clear these. It - * is assumed the write once behaviour is required only for the first - * system word. - */ + const unsigned long word = flags->__vma_flags[0]; + + /* It is assumed only the first system word must be written once. */ + vma_flags_overwrite_word_once(&vma->flags, word); + /* The remainder can be copied normally. */ if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) { - unsigned long *bitmap = vma->flags.__vma_flags; + unsigned long *dst = &vma->flags.__vma_flags[1]; + const unsigned long *src = &flags->__vma_flags[1]; - bitmap_zero(&bitmap[1], NUM_VMA_FLAG_BITS - BITS_PER_LONG); + bitmap_copy(dst, src, NUM_VMA_FLAG_BITS - BITS_PER_LONG); } - - vma_flags_overwrite_word_once(&vma->flags, flags); } static inline void vm_flags_set(struct vm_area_struct *vma, diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index fd5f42765497..d83e349900a3 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -23,6 +23,9 @@ /* The set of all possible UFFD-related VM flags. */ #define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR) +#define __VMA_UFFD_FLAGS mk_vma_flags(VMA_UFFD_MISSING_BIT, VMA_UFFD_WP_BIT, \ + VMA_UFFD_MINOR_BIT) + /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining * new flags, since they might collide with O_* ones. 
We want diff --git a/mm/madvise.c b/mm/madvise.c index afe0f01765c4..69708e953cf5 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -151,13 +151,15 @@ static int madvise_update_vma(vm_flags_t new_flags, struct madvise_behavior *madv_behavior) { struct vm_area_struct *vma = madv_behavior->vma; + vma_flags_t new_vma_flags = legacy_to_vma_flags(new_flags); struct madvise_behavior_range *range = &madv_behavior->range; struct anon_vma_name *anon_name = madv_behavior->anon_name; bool set_new_anon_name = madv_behavior->behavior == __MADV_SET_ANON_VMA_NAME; VMA_ITERATOR(vmi, madv_behavior->mm, range->start); - if (new_flags == vma->vm_flags && (!set_new_anon_name || - anon_vma_name_eq(anon_vma_name(vma), anon_name))) + if (vma_flags_same_mask(&vma->flags, new_vma_flags) && + (!set_new_anon_name || + anon_vma_name_eq(anon_vma_name(vma), anon_name))) return 0; if (set_new_anon_name) @@ -165,7 +167,7 @@ static int madvise_update_vma(vm_flags_t new_flags, range->start, range->end, anon_name); else vma = vma_modify_flags(&vmi, madv_behavior->prev, vma, - range->start, range->end, &new_flags); + range->start, range->end, &new_vma_flags); if (IS_ERR(vma)) return PTR_ERR(vma); @@ -174,7 +176,7 @@ static int madvise_update_vma(vm_flags_t new_flags, /* vm_flags is protected by the mmap_lock held in write mode. */ vma_start_write(vma); - vm_flags_reset(vma, new_flags); + vma->flags = new_vma_flags; if (set_new_anon_name) return replace_anon_vma_name(vma, anon_name); diff --git a/mm/mlock.c b/mm/mlock.c index fd648138bc72..fdbd1434a35f 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -415,13 +415,14 @@ out: * @vma - vma containing range to be mlock()ed or munlock()ed * @start - start address in @vma of the range * @end - end of range in @vma - * @newflags - the new set of flags for @vma. + * @new_vma_flags - the new set of flags for @vma. * * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED; * called for munlock() and munlockall(), to clear VM_LOCKED from @vma. 
*/ static void mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, vm_flags_t newflags) + unsigned long start, unsigned long end, + vma_flags_t *new_vma_flags) { static const struct mm_walk_ops mlock_walk_ops = { .pmd_entry = mlock_pte_range, @@ -439,18 +440,18 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, * combination should not be visible to other mmap_lock users; * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED. */ - if (newflags & VM_LOCKED) - newflags |= VM_IO; + if (vma_flags_test(new_vma_flags, VMA_LOCKED_BIT)) + vma_flags_set(new_vma_flags, VMA_IO_BIT); vma_start_write(vma); - vm_flags_reset_once(vma, newflags); + vma_flags_reset_once(vma, new_vma_flags); lru_add_drain(); walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL); lru_add_drain(); - if (newflags & VM_IO) { - newflags &= ~VM_IO; - vm_flags_reset_once(vma, newflags); + if (vma_flags_test(new_vma_flags, VMA_IO_BIT)) { + vma_flags_clear(new_vma_flags, VMA_IO_BIT); + vma_flags_reset_once(vma, new_vma_flags); } } @@ -467,20 +468,22 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, vm_flags_t newflags) { + vma_flags_t new_vma_flags = legacy_to_vma_flags(newflags); + const vma_flags_t old_vma_flags = vma->flags; struct mm_struct *mm = vma->vm_mm; int nr_pages; int ret = 0; - vm_flags_t oldflags = vma->vm_flags; - if (newflags == oldflags || vma_is_secretmem(vma) || - !vma_supports_mlock(vma)) + if (vma_flags_same_pair(&old_vma_flags, &new_vma_flags) || + vma_is_secretmem(vma) || !vma_supports_mlock(vma)) { /* * Don't set VM_LOCKED or VM_LOCKONFAULT and don't count. * For secretmem, don't allow the memory to be unlocked. 
*/ goto out; + } - vma = vma_modify_flags(vmi, *prev, vma, start, end, &newflags); + vma = vma_modify_flags(vmi, *prev, vma, start, end, &new_vma_flags); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; @@ -490,9 +493,9 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, * Keep track of amount of locked VM. */ nr_pages = (end - start) >> PAGE_SHIFT; - if (!(newflags & VM_LOCKED)) + if (!vma_flags_test(&new_vma_flags, VMA_LOCKED_BIT)) nr_pages = -nr_pages; - else if (oldflags & VM_LOCKED) + else if (vma_flags_test(&old_vma_flags, VMA_LOCKED_BIT)) nr_pages = 0; mm->locked_vm += nr_pages; @@ -501,12 +504,13 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, * It's okay if try_to_unmap_one unmaps a page just after we * set VM_LOCKED, populate_vma_page_range will bring it back. */ - if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) { + if (vma_flags_test(&new_vma_flags, VMA_LOCKED_BIT) && + vma_flags_test(&old_vma_flags, VMA_LOCKED_BIT)) { /* No work to do, and mlocking twice would be wrong */ vma_start_write(vma); - vm_flags_reset(vma, newflags); + vma->flags = new_vma_flags; } else { - mlock_vma_pages_range(vma, start, end, newflags); + mlock_vma_pages_range(vma, start, end, &new_vma_flags); } out: *prev = vma; diff --git a/mm/mprotect.c b/mm/mprotect.c index eaa724b99908..941f1211da0d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -756,13 +756,11 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, vma_flags_clear(&new_vma_flags, VMA_ACCOUNT_BIT); } - newflags = vma_flags_to_legacy(new_vma_flags); - vma = vma_modify_flags(vmi, *pprev, vma, start, end, &newflags); + vma = vma_modify_flags(vmi, *pprev, vma, start, end, &new_vma_flags); if (IS_ERR(vma)) { error = PTR_ERR(vma); goto fail; } - new_vma_flags = legacy_to_vma_flags(newflags); *pprev = vma; @@ -771,7 +769,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * held in write mode. 
*/ vma_start_write(vma); - vm_flags_reset_once(vma, newflags); + vma_flags_reset_once(vma, &new_vma_flags); if (vma_wants_manual_pte_write_upgrade(vma)) mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; vma_set_page_prot(vma); @@ -796,6 +794,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, } vm_stat_account(mm, vma_flags_to_legacy(old_vma_flags), -nrpages); + newflags = vma_flags_to_legacy(new_vma_flags); vm_stat_account(mm, newflags, nrpages); perf_event_mmap(vma); return 0; diff --git a/mm/mseal.c b/mm/mseal.c index ac58643181f7..e2093ae3d25c 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -68,14 +68,17 @@ static int mseal_apply(struct mm_struct *mm, const unsigned long curr_start = MAX(vma->vm_start, start); const unsigned long curr_end = MIN(vma->vm_end, end); - if (!(vma->vm_flags & VM_SEALED)) { - vm_flags_t vm_flags = vma->vm_flags | VM_SEALED; + if (!vma_test(vma, VMA_SEALED_BIT)) { + vma_flags_t vma_flags = vma->flags; + + vma_flags_set(&vma_flags, VMA_SEALED_BIT); vma = vma_modify_flags(&vmi, prev, vma, curr_start, - curr_end, &vm_flags); + curr_end, &vma_flags); if (IS_ERR(vma)) return PTR_ERR(vma); - vm_flags_set(vma, VM_SEALED); + vma_start_write(vma); + vma_set_flags(vma, VMA_SEALED_BIT); } prev = vma; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 2c565c7134b6..89879c3ba344 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1976,6 +1976,9 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, { struct vm_area_struct *ret; bool give_up_on_oom = false; + vma_flags_t new_vma_flags = vma->flags; + + vma_flags_clear_mask(&new_vma_flags, __VMA_UFFD_FLAGS); /* * If we are modifying only and not splitting, just give up on the merge @@ -1989,8 +1992,8 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, uffd_wp_range(vma, start, end - start, false); ret = vma_modify_flags_uffd(vmi, prev, vma, start, end, - vma->vm_flags & ~__VM_UFFD_FLAGS, - NULL_VM_UFFD_CTX, give_up_on_oom); + &new_vma_flags, 
NULL_VM_UFFD_CTX, + give_up_on_oom); /* * In the vma_merge() successful mprotect-like case 8: @@ -2010,10 +2013,11 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long end, bool wp_async) { + vma_flags_t vma_flags = legacy_to_vma_flags(vm_flags); VMA_ITERATOR(vmi, ctx->mm, start); struct vm_area_struct *prev = vma_prev(&vmi); unsigned long vma_end; - vm_flags_t new_flags; + vma_flags_t new_vma_flags; if (vma->vm_start < start) prev = vma; @@ -2024,23 +2028,26 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx, VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async)); VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); - VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)); + VM_WARN_ON_ONCE(!vma_test(vma, VMA_MAYWRITE_BIT)); /* * Nothing to do: this vma is already registered into this * userfaultfd and with the right tracking mode too. */ if (vma->vm_userfaultfd_ctx.ctx == ctx && - (vma->vm_flags & vm_flags) == vm_flags) + vma_test_all_mask(vma, vma_flags)) goto skip; if (vma->vm_start > start) start = vma->vm_start; vma_end = min(end, vma->vm_end); - new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; + new_vma_flags = vma->flags; + vma_flags_clear_mask(&new_vma_flags, __VMA_UFFD_FLAGS); + vma_flags_set_mask(&new_vma_flags, vma_flags); + vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, - new_flags, + &new_vma_flags, (struct vm_userfaultfd_ctx){ctx}, /* give_up_on_oom = */false); if (IS_ERR(vma)) diff --git a/mm/vma.c b/mm/vma.c index 9d194f8e7acb..16a1d708c978 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -1710,13 +1710,13 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, - vm_flags_t *vm_flags_ptr) + vma_flags_t *vma_flags_ptr) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); - 
const vm_flags_t vm_flags = *vm_flags_ptr; + const vma_flags_t vma_flags = *vma_flags_ptr; struct vm_area_struct *ret; - vmg.vm_flags = vm_flags; + vmg.vma_flags = vma_flags; ret = vma_modify(&vmg); if (IS_ERR(ret)) @@ -1728,7 +1728,7 @@ struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, * them to the caller. */ if (vmg.state == VMA_MERGE_SUCCESS) - *vm_flags_ptr = ret->vm_flags; + *vma_flags_ptr = ret->flags; return ret; } @@ -1758,12 +1758,13 @@ struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, - unsigned long start, unsigned long end, vm_flags_t vm_flags, - struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom) + unsigned long start, unsigned long end, + const vma_flags_t *vma_flags, struct vm_userfaultfd_ctx new_ctx, + bool give_up_on_oom) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); - vmg.vm_flags = vm_flags; + vmg.vma_flags = *vma_flags; vmg.uffd_ctx = new_ctx; if (give_up_on_oom) vmg.give_up_on_oom = true; diff --git a/mm/vma.h b/mm/vma.h index 1f2de6cb3b97..270008e5babc 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -342,24 +342,23 @@ void unmap_region(struct unmap_desc *unmap); * @vma: The VMA containing the range @start to @end to be updated. * @start: The start of the range to update. May be offset within @vma. * @end: The exclusive end of the range to update, may be offset within @vma. - * @vm_flags_ptr: A pointer to the VMA flags that the @start to @end range is + * @vma_flags_ptr: A pointer to the VMA flags that the @start to @end range is * about to be set to. On merge, this will be updated to include sticky flags. * * IMPORTANT: The actual modification being requested here is NOT applied, * rather the VMA is perhaps split, perhaps merged to accommodate the change, * and the caller is expected to perform the actual modification. 
* - * In order to account for sticky VMA flags, the @vm_flags_ptr parameter points + * In order to account for sticky VMA flags, the @vma_flags_ptr parameter points * to the requested flags which are then updated so the caller, should they * overwrite any existing flags, correctly retains these. * * Returns: A VMA which contains the range @start to @end ready to have its - * flags altered to *@vm_flags. + * flags altered to *@vma_flags. */ __must_check struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, - unsigned long start, unsigned long end, - vm_flags_t *vm_flags_ptr); + unsigned long start, unsigned long end, vma_flags_t *vma_flags_ptr); /** * vma_modify_name() - Perform any necessary split/merge in preparation for @@ -418,7 +417,7 @@ __must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, * @vma: The VMA containing the range @start to @end to be updated. * @start: The start of the range to update. May be offset within @vma. * @end: The exclusive end of the range to update, may be offset within @vma. - * @vm_flags: The VMA flags that the @start to @end range is about to be set to. + * @vma_flags: The VMA flags that the @start to @end range is about to be set to. * @new_ctx: The userfaultfd context that the @start to @end range is about to * be set to. * @give_up_on_oom: If an out of memory condition occurs on merge, simply give @@ -429,11 +428,11 @@ __must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, * and the caller is expected to perform the actual modification. * * Returns: A VMA which contains the range @start to @end ready to have its VMA - * flags changed to @vm_flags and its userfaultfd context changed to @new_ctx. + * flags changed to @vma_flags and its userfaultfd context changed to @new_ctx. 
*/ __must_check struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, - unsigned long start, unsigned long end, vm_flags_t vm_flags, + unsigned long start, unsigned long end, const vma_flags_t *vma_flags, struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom); __must_check struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 58a621ec389f..9dd57f50ea6d 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -871,16 +871,20 @@ static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_init(vma, flags); } -static inline void vm_flags_reset_once(struct vm_area_struct *vma, - vm_flags_t flags) +static inline void vma_flags_reset_once(struct vm_area_struct *vma, + vma_flags_t *flags) { - vma_assert_write_locked(vma); - /* - * The user should only be interested in avoiding reordering of - * assignment to the first word. - */ - vma_flags_clear_all(&vma->flags); - vma_flags_overwrite_word_once(&vma->flags, flags); + const unsigned long word = flags->__vma_flags[0]; + + /* It is assumed only the first system word must be written once. */ + vma_flags_overwrite_word_once(&vma->flags, word); + /* The remainder can be copied normally. 
*/ + if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) { + unsigned long *dst = &vma->flags.__vma_flags[1]; + const unsigned long *src = &flags->__vma_flags[1]; + + bitmap_copy(dst, src, NUM_VMA_FLAG_BITS - BITS_PER_LONG); + } } static inline void vm_flags_set(struct vm_area_struct *vma, diff --git a/tools/testing/vma/tests/merge.c b/tools/testing/vma/tests/merge.c index 44e3977e3fc0..03b6f9820e0a 100644 --- a/tools/testing/vma/tests/merge.c +++ b/tools/testing/vma/tests/merge.c @@ -132,7 +132,6 @@ static bool test_simple_modify(void) struct vm_area_struct *vma; vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT); - vm_flags_t legacy_flags = VM_READ | VM_WRITE; struct mm_struct mm = {}; struct vm_area_struct *init_vma = alloc_vma(&mm, 0, 0x3000, 0, vma_flags); VMA_ITERATOR(vmi, &mm, 0x1000); @@ -144,7 +143,7 @@ static bool test_simple_modify(void) * performs the merge/split only. */ vma = vma_modify_flags(&vmi, init_vma, init_vma, - 0x1000, 0x2000, &legacy_flags); + 0x1000, 0x2000, &vma_flags); ASSERT_NE(vma, NULL); /* We modify the provided VMA, and on split allocate new VMAs. */ ASSERT_EQ(vma, init_vma); -- cgit v1.2.3 From 90cb921c4d7bf92854344d3e76561f48784c613e Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:41 +0000 Subject: mm/vma: convert __mmap_region() to use vma_flags_t Update the mmap() implementation logic implemented in __mmap_region() and functions invoked by it. The mmap_region() function converts its input vm_flags_t parameter to a vma_flags_t value which it then passes to __mmap_region() which uses the vma_flags_t value consistently from then on. As part of the change, we convert map_deny_write_exec() to using vma_flags_t (it was incorrectly using unsigned long before), and place it in vma.h, as it is only used internal to mm. With this change, we eliminate the legacy is_shared_maywrite_vm_flags() helper function which is now no longer required. 
We are also able to update the MMAP_STATE() and VMG_MMAP_STATE() macros to use the vma_flags_t value. Finally, we update the VMA tests to reflect the change. Link: https://lkml.kernel.org/r/1fc33a404c962f02da778da100387cc19bd62153.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 18 ++++++++++----- include/linux/mman.h | 49 --------------------------------------- mm/mprotect.c | 4 +++- mm/vma.c | 25 ++++++++++---------- mm/vma.h | 51 +++++++++++++++++++++++++++++++++++++++++ tools/testing/vma/include/dup.h | 34 ++++++--------------------- tools/testing/vma/tests/mmap.c | 18 +++++---------- 7 files changed, 92 insertions(+), 107 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 72bc5016094b..9472b3c9a22b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1522,12 +1522,6 @@ static inline bool vma_is_accessible(const struct vm_area_struct *vma) return vma->vm_flags & VM_ACCESS_FLAGS; } -static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags) -{ - return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == - (VM_SHARED | VM_MAYWRITE); -} - static inline bool 
is_shared_maywrite(const vma_flags_t *flags) { return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT); @@ -4335,12 +4329,24 @@ static inline bool range_in_vma(const struct vm_area_struct *vma, #ifdef CONFIG_MMU pgprot_t vm_get_page_prot(vm_flags_t vm_flags); + +static inline pgprot_t vma_get_page_prot(vma_flags_t vma_flags) +{ + const vm_flags_t vm_flags = vma_flags_to_legacy(vma_flags); + + return vm_get_page_prot(vm_flags); +} + void vma_set_page_prot(struct vm_area_struct *vma); #else static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags) { return __pgprot(0); } +static inline pgprot_t vma_get_page_prot(vma_flags_t vma_flags) +{ + return __pgprot(0); +} static inline void vma_set_page_prot(struct vm_area_struct *vma) { vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); diff --git a/include/linux/mman.h b/include/linux/mman.h index 0ba8a7e8b90a..389521594c69 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -170,53 +170,4 @@ static inline bool arch_memory_deny_write_exec_supported(void) } #define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported #endif - -/* - * Denies creating a writable executable mapping or gaining executable permissions. - * - * This denies the following: - * - * a) mmap(PROT_WRITE | PROT_EXEC) - * - * b) mmap(PROT_WRITE) - * mprotect(PROT_EXEC) - * - * c) mmap(PROT_WRITE) - * mprotect(PROT_READ) - * mprotect(PROT_EXEC) - * - * But allows the following: - * - * d) mmap(PROT_READ | PROT_EXEC) - * mmap(PROT_READ | PROT_EXEC | PROT_BTI) - * - * This is only applicable if the user has set the Memory-Deny-Write-Execute - * (MDWE) protection mask for the current process. - * - * @old specifies the VMA flags the VMA originally possessed, and @new the ones - * we propose to set. - * - * Return: false if proposed change is OK, true if not ok and should be denied. 
- */ -static inline bool map_deny_write_exec(unsigned long old, unsigned long new) -{ - /* If MDWE is disabled, we have nothing to deny. */ - if (!mm_flags_test(MMF_HAS_MDWE, current->mm)) - return false; - - /* If the new VMA is not executable, we have nothing to deny. */ - if (!(new & VM_EXEC)) - return false; - - /* Under MDWE we do not accept newly writably executable VMAs... */ - if (new & VM_WRITE) - return true; - - /* ...nor previously non-executable VMAs becoming executable. */ - if (!(old & VM_EXEC)) - return true; - - return false; -} - #endif /* _LINUX_MMAN_H */ diff --git a/mm/mprotect.c b/mm/mprotect.c index 941f1211da0d..007d9a72b2f0 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -882,6 +882,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, tmp = vma->vm_start; for_each_vma_range(vmi, vma, end) { vm_flags_t mask_off_old_flags; + vma_flags_t new_vma_flags; vm_flags_t newflags; int new_vma_pkey; @@ -904,6 +905,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey); newflags = calc_vm_prot_bits(prot, new_vma_pkey); newflags |= (vma->vm_flags & ~mask_off_old_flags); + new_vma_flags = legacy_to_vma_flags(newflags); /* newflags >> 4 shift VM_MAY% in place of VM_% */ if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) { @@ -911,7 +913,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, break; } - if (map_deny_write_exec(vma->vm_flags, newflags)) { + if (map_deny_write_exec(&vma->flags, &new_vma_flags)) { error = -EACCES; break; } diff --git a/mm/vma.c b/mm/vma.c index 16a1d708c978..c335f989586f 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -44,7 +44,7 @@ struct mmap_state { bool file_doesnt_need_get :1; }; -#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \ +#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vma_flags_, file_) \ struct mmap_state name = { \ .mm = mm_, \ .vmi = vmi_, \ @@ -52,9 +52,9 @@ struct mmap_state { .end = 
(addr_) + (len_), \ .pgoff = pgoff_, \ .pglen = PHYS_PFN(len_), \ - .vm_flags = vm_flags_, \ + .vma_flags = vma_flags_, \ .file = file_, \ - .page_prot = vm_get_page_prot(vm_flags_), \ + .page_prot = vma_get_page_prot(vma_flags_), \ } #define VMG_MMAP_STATE(name, map_, vma_) \ @@ -63,7 +63,7 @@ struct mmap_state { .vmi = (map_)->vmi, \ .start = (map_)->addr, \ .end = (map_)->end, \ - .vm_flags = (map_)->vm_flags, \ + .vma_flags = (map_)->vma_flags, \ .pgoff = (map_)->pgoff, \ .file = (map_)->file, \ .prev = (map_)->prev, \ @@ -2746,14 +2746,14 @@ static int call_action_complete(struct mmap_state *map, } static unsigned long __mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) + unsigned long len, vma_flags_t vma_flags, + unsigned long pgoff, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; bool have_mmap_prepare = file && file->f_op->mmap_prepare; VMA_ITERATOR(vmi, mm, addr); - MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); + MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vma_flags, file); struct vm_area_desc desc = { .mm = mm, .file = file, @@ -2837,16 +2837,17 @@ abort_munmap: * been performed. */ unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) + unsigned long len, vm_flags_t vm_flags, + unsigned long pgoff, struct list_head *uf) { unsigned long ret; bool writable_file_mapping = false; + const vma_flags_t vma_flags = legacy_to_vma_flags(vm_flags); mmap_assert_write_locked(current->mm); /* Check to see if MDWE is applicable. */ - if (map_deny_write_exec(vm_flags, vm_flags)) + if (map_deny_write_exec(&vma_flags, &vma_flags)) return -EACCES; /* Allow architectures to sanity-check the vm_flags. 
*/ @@ -2854,7 +2855,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return -EINVAL; /* Map writable and ensure this isn't a sealed memfd. */ - if (file && is_shared_maywrite_vm_flags(vm_flags)) { + if (file && is_shared_maywrite(&vma_flags)) { int error = mapping_map_writable(file->f_mapping); if (error) @@ -2862,7 +2863,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, writable_file_mapping = true; } - ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); + ret = __mmap_region(file, addr, len, vma_flags, pgoff, uf); /* Clear our write mapping regardless of error. */ if (writable_file_mapping) diff --git a/mm/vma.h b/mm/vma.h index 270008e5babc..adc18f7dd9f1 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -704,4 +704,55 @@ int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); #endif +#ifdef CONFIG_MMU +/* + * Denies creating a writable executable mapping or gaining executable permissions. + * + * This denies the following: + * + * a) mmap(PROT_WRITE | PROT_EXEC) + * + * b) mmap(PROT_WRITE) + * mprotect(PROT_EXEC) + * + * c) mmap(PROT_WRITE) + * mprotect(PROT_READ) + * mprotect(PROT_EXEC) + * + * But allows the following: + * + * d) mmap(PROT_READ | PROT_EXEC) + * mmap(PROT_READ | PROT_EXEC | PROT_BTI) + * + * This is only applicable if the user has set the Memory-Deny-Write-Execute + * (MDWE) protection mask for the current process. + * + * @old specifies the VMA flags the VMA originally possessed, and @new the ones + * we propose to set. + * + * Return: false if proposed change is OK, true if not ok and should be denied. + */ +static inline bool map_deny_write_exec(const vma_flags_t *old, + const vma_flags_t *new) +{ + /* If MDWE is disabled, we have nothing to deny. */ + if (!mm_flags_test(MMF_HAS_MDWE, current->mm)) + return false; + + /* If the new VMA is not executable, we have nothing to deny. 
*/ + if (!vma_flags_test(new, VMA_EXEC_BIT)) + return false; + + /* Under MDWE we do not accept newly writably executable VMAs... */ + if (vma_flags_test(new, VMA_WRITE_BIT)) + return true; + + /* ...nor previously non-executable VMAs becoming executable. */ + if (!vma_flags_test(old, VMA_EXEC_BIT)) + return true; + + return false; +} +#endif + #endif /* __MM_VMA_H */ diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 9dd57f50ea6d..ab92358b082c 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -1124,12 +1124,6 @@ static __always_inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, #define vma_desc_clear_flags(desc, ...) \ vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) -static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags) -{ - return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == - (VM_SHARED | VM_MAYWRITE); -} - static inline bool is_shared_maywrite(const vma_flags_t *flags) { return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT); @@ -1446,27 +1440,6 @@ static inline bool mlock_future_ok(const struct mm_struct *mm, return locked_pages <= limit_pages; } -static inline bool map_deny_write_exec(unsigned long old, unsigned long new) -{ - /* If MDWE is disabled, we have nothing to deny. */ - if (mm_flags_test(MMF_HAS_MDWE, current->mm)) - return false; - - /* If the new VMA is not executable, we have nothing to deny. */ - if (!(new & VM_EXEC)) - return false; - - /* Under MDWE we do not accept newly writably executable VMAs... */ - if (new & VM_WRITE) - return true; - - /* ...nor previously non-executable VMAs becoming executable. */ - if (!(old & VM_EXEC)) - return true; - - return false; -} - static inline int mapping_map_writable(struct address_space *mapping) { return atomic_inc_unless_negative(&mapping->i_mmap_writable) ? 
@@ -1518,3 +1491,10 @@ static inline int get_sysctl_max_map_count(void) #ifndef pgtable_supports_soft_dirty #define pgtable_supports_soft_dirty() IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) #endif + +static inline pgprot_t vma_get_page_prot(vma_flags_t vma_flags) +{ + const vm_flags_t vm_flags = vma_flags_to_legacy(vma_flags); + + return vm_get_page_prot(vm_flags); +} diff --git a/tools/testing/vma/tests/mmap.c b/tools/testing/vma/tests/mmap.c index bded4ecbe5db..c85bc000d1cb 100644 --- a/tools/testing/vma/tests/mmap.c +++ b/tools/testing/vma/tests/mmap.c @@ -2,6 +2,8 @@ static bool test_mmap_region_basic(void) { + const vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT); struct mm_struct mm = {}; unsigned long addr; struct vm_area_struct *vma; @@ -10,27 +12,19 @@ static bool test_mmap_region_basic(void) current->mm = &mm; /* Map at 0x300000, length 0x3000. */ - addr = __mmap_region(NULL, 0x300000, 0x3000, - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, - 0x300, NULL); + addr = __mmap_region(NULL, 0x300000, 0x3000, vma_flags, 0x300, NULL); ASSERT_EQ(addr, 0x300000); /* Map at 0x250000, length 0x3000. */ - addr = __mmap_region(NULL, 0x250000, 0x3000, - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, - 0x250, NULL); + addr = __mmap_region(NULL, 0x250000, 0x3000, vma_flags, 0x250, NULL); ASSERT_EQ(addr, 0x250000); /* Map at 0x303000, merging to 0x300000 of length 0x6000. */ - addr = __mmap_region(NULL, 0x303000, 0x3000, - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, - 0x303, NULL); + addr = __mmap_region(NULL, 0x303000, 0x3000, vma_flags, 0x303, NULL); ASSERT_EQ(addr, 0x303000); /* Map at 0x24d000, merging to 0x250000 of length 0x6000. 
*/ - addr = __mmap_region(NULL, 0x24d000, 0x3000, - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, - 0x24d, NULL); + addr = __mmap_region(NULL, 0x24d000, 0x3000, vma_flags, 0x24d, NULL); ASSERT_EQ(addr, 0x24d000); ASSERT_EQ(mm.map_count, 2); -- cgit v1.2.3 From 3e4bb2706817710d9461394da8b75be79981586b Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:27 +0000 Subject: mm: various small mmap_prepare cleanups Patch series "mm: expand mmap_prepare functionality and usage", v4. This series expands the mmap_prepare functionality, which is intended to replace the deprecated f_op->mmap hook which has been the source of bugs and security issues for some time. This series starts with some cleanup of existing mmap_prepare logic, then adds documentation for the mmap_prepare call to make it easier for filesystem and driver writers to understand how it works. It then importantly adds a vm_ops->mapped hook, a key feature that was missing from mmap_prepare previously - this is invoked when a driver which specifies mmap_prepare has successfully been mapped but not merged with another VMA. mmap_prepare is invoked prior to a merge being attempted, so you cannot manipulate state such as reference counts as if it were a new mapping. The vm_ops->mapped hook allows a driver to perform tasks required at this stage, and provides symmetry against subsequent vm_ops->open,close calls. The series uses this to correct the afs implementation which wrongly manipulated reference count at mmap_prepare time. It then adds an mmap_prepare equivalent of vm_iomap_memory() - mmap_action_simple_ioremap(), then uses this to update a number of drivers. It then splits out the mmap_prepare compatibility layer (which allows for invocation of mmap_prepare hooks in an mmap() hook) in such a way as to allow for more incremental implementation of mmap_prepare hooks. It then uses this to extend mmap_prepare usage in drivers. 
Finally it adds an mmap_prepare equivalent of vm_map_pages(), which lays the foundation for future work which will extend mmap_prepare to DMA coherent mappings. This patch (of 21): Rather than passing arbitrary fields, pass a vm_area_desc pointer to the mmap prepare functions, and an action and vma pointer to mmap complete in order to put all the action-specific logic in the function actually doing the work. Additionally, allow mmap prepare functions to return an error so we can error out as soon as possible if there is something logically incorrect in the input. Update remap_pfn_range_prepare() to properly check the input range for the CoW case. Also remove io_remap_pfn_range_complete(), as we can simply set up the fields correctly in io_remap_pfn_range_prepare() and use remap_pfn_range_complete() for this. While we're here, make remap_pfn_range_prepare_vma() a little neater, and pass mmap_action directly to call_action_complete(). Then, update compat_vma_mmap() to perform its logic directly, as __compat_vma_mmap() is not used by anything so we don't need to export it. Also update compat_vma_mmap() to use vfs_mmap_prepare() rather than calling the mmap_prepare op directly. Finally, update the VMA userland tests to reflect the changes. Link: https://lkml.kernel.org/r/cover.1774045440.git.ljs@kernel.org Link: https://lkml.kernel.org/r/99f408e4694f44ab12bdc55fe0bd9685d3bd1117.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K.
Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- include/linux/fs.h | 2 - include/linux/mm.h | 7 +-- mm/internal.h | 32 +++++----- mm/memory.c | 45 +++++++++----- mm/util.c | 121 +++++++++++++++++--------------------- mm/vma.c | 24 ++++---- tools/testing/vma/include/dup.h | 7 ++- tools/testing/vma/include/stubs.h | 8 +-- 8 files changed, 126 insertions(+), 120 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 8b3dd145b25e..a2628a12bd2b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2058,8 +2058,6 @@ static inline bool can_mmap_file(struct file *file) return true; } -int __compat_vma_mmap(const struct file_operations *f_op, - struct file *file, struct vm_area_struct *vma); int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9472b3c9a22b..6ca2fc5ae83f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4304,10 +4304,9 @@ static inline void mmap_action_ioremap_full(struct vm_area_desc *desc, mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc)); } -void mmap_action_prepare(struct mmap_action *action, - struct vm_area_desc *desc); -int mmap_action_complete(struct mmap_action *action, - struct vm_area_struct *vma); +int mmap_action_prepare(struct vm_area_desc *desc); +int mmap_action_complete(struct vm_area_struct *vma, + struct mmap_action *action); /* Look up the first VMA which exactly match the interval vm_start ... 
vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, diff --git a/mm/internal.h b/mm/internal.h index 9c690f8635da..4dddd89153d4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1839,26 +1839,28 @@ int walk_page_range_debug(struct mm_struct *mm, unsigned long start, void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm); int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm); -void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn); -int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t pgprot); +int remap_pfn_range_prepare(struct vm_area_desc *desc); +int remap_pfn_range_complete(struct vm_area_struct *vma, + struct mmap_action *action); -static inline void io_remap_pfn_range_prepare(struct vm_area_desc *desc, - unsigned long orig_pfn, unsigned long size) +static inline int io_remap_pfn_range_prepare(struct vm_area_desc *desc) { + struct mmap_action *action = &desc->action; + const unsigned long orig_pfn = action->remap.start_pfn; + const pgprot_t orig_pgprot = action->remap.pgprot; + const unsigned long size = action->remap.size; const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size); + int err; - return remap_pfn_range_prepare(desc, pfn); -} + action->remap.start_pfn = pfn; + action->remap.pgprot = pgprot_decrypted(orig_pgprot); + err = remap_pfn_range_prepare(desc); + if (err) + return err; -static inline int io_remap_pfn_range_complete(struct vm_area_struct *vma, - unsigned long addr, unsigned long orig_pfn, unsigned long size, - pgprot_t orig_prot) -{ - const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size); - const pgprot_t prot = pgprot_decrypted(orig_prot); - - return remap_pfn_range_complete(vma, addr, pfn, size, prot); + /* Remap does the actual work. 
*/ + action->type = MMAP_REMAP_PFN; + return 0; } #ifdef CONFIG_MMU_NOTIFIER diff --git a/mm/memory.c b/mm/memory.c index 425e852a2eb7..10a61dd81f97 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3099,26 +3099,34 @@ static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } #endif -void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) +int remap_pfn_range_prepare(struct vm_area_desc *desc) { - /* - * We set addr=VMA start, end=VMA end here, so this won't fail, but we - * check it again on complete and will fail there if specified addr is - * invalid. - */ - get_remap_pgoff(vma_desc_is_cow_mapping(desc), desc->start, desc->end, - desc->start, desc->end, pfn, &desc->pgoff); + const struct mmap_action *action = &desc->action; + const unsigned long start = action->remap.start; + const unsigned long end = start + action->remap.size; + const unsigned long pfn = action->remap.start_pfn; + const bool is_cow = vma_desc_is_cow_mapping(desc); + int err; + + err = get_remap_pgoff(is_cow, start, end, desc->start, desc->end, pfn, + &desc->pgoff); + if (err) + return err; + vma_desc_set_flags_mask(desc, VMA_REMAP_FLAGS); + return 0; } -static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size) +static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, + unsigned long size) { - unsigned long end = addr + PAGE_ALIGN(size); + const unsigned long end = addr + PAGE_ALIGN(size); + const bool is_cow = is_cow_mapping(vma->vm_flags); int err; - err = get_remap_pgoff(is_cow_mapping(vma->vm_flags), addr, end, - vma->vm_start, vma->vm_end, pfn, &vma->vm_pgoff); + err = get_remap_pgoff(is_cow, addr, end, vma->vm_start, vma->vm_end, + pfn, &vma->vm_pgoff); if (err) return err; @@ -3151,10 +3159,15 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(remap_pfn_range); -int 
remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot) +int remap_pfn_range_complete(struct vm_area_struct *vma, + struct mmap_action *action) { - return do_remap_pfn_range(vma, addr, pfn, size, prot); + const unsigned long start = action->remap.start; + const unsigned long pfn = action->remap.start_pfn; + const unsigned long size = action->remap.size; + const pgprot_t prot = action->remap.pgprot; + + return do_remap_pfn_range(vma, start, pfn, size, prot); } /** diff --git a/mm/util.c b/mm/util.c index ce7ae80047cf..73c97a748d8e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1163,43 +1163,6 @@ void flush_dcache_folio(struct folio *folio) EXPORT_SYMBOL(flush_dcache_folio); #endif -/** - * __compat_vma_mmap() - See description for compat_vma_mmap() - * for details. This is the same operation, only with a specific file operations - * struct which may or may not be the same as vma->vm_file->f_op. - * @f_op: The file operations whose .mmap_prepare() hook is specified. - * @file: The file which backs or will back the mapping. - * @vma: The VMA to apply the .mmap_prepare() hook to. - * Returns: 0 on success or error. - */ -int __compat_vma_mmap(const struct file_operations *f_op, - struct file *file, struct vm_area_struct *vma) -{ - struct vm_area_desc desc = { - .mm = vma->vm_mm, - .file = file, - .start = vma->vm_start, - .end = vma->vm_end, - - .pgoff = vma->vm_pgoff, - .vm_file = vma->vm_file, - .vma_flags = vma->flags, - .page_prot = vma->vm_page_prot, - - .action.type = MMAP_NOTHING, /* Default */ - }; - int err; - - err = f_op->mmap_prepare(&desc); - if (err) - return err; - - mmap_action_prepare(&desc.action, &desc); - set_vma_from_desc(vma, &desc); - return mmap_action_complete(&desc.action, vma); -} -EXPORT_SYMBOL(__compat_vma_mmap); - /** * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an * existing VMA and execute any requested actions. 
@@ -1228,7 +1191,31 @@ EXPORT_SYMBOL(__compat_vma_mmap); */ int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { - return __compat_vma_mmap(file->f_op, file, vma); + struct vm_area_desc desc = { + .mm = vma->vm_mm, + .file = file, + .start = vma->vm_start, + .end = vma->vm_end, + + .pgoff = vma->vm_pgoff, + .vm_file = vma->vm_file, + .vma_flags = vma->flags, + .page_prot = vma->vm_page_prot, + + .action.type = MMAP_NOTHING, /* Default */ + }; + int err; + + err = vfs_mmap_prepare(file, &desc); + if (err) + return err; + + err = mmap_action_prepare(&desc); + if (err) + return err; + + set_vma_from_desc(vma, &desc); + return mmap_action_complete(vma, &desc.action); } EXPORT_SYMBOL(compat_vma_mmap); @@ -1320,8 +1307,8 @@ again: } } -static int mmap_action_finish(struct mmap_action *action, - const struct vm_area_struct *vma, int err) +static int mmap_action_finish(struct vm_area_struct *vma, + struct mmap_action *action, int err) { /* * If an error occurs, unmap the VMA altogether and return an error. We @@ -1353,37 +1340,38 @@ static int mmap_action_finish(struct mmap_action *action, /** * mmap_action_prepare - Perform preparatory setup for an VMA descriptor * action which need to be performed. - * @desc: The VMA descriptor to prepare for @action. - * @action: The action to perform. + * @desc: The VMA descriptor to prepare for its @desc->action. + * + * Returns: %0 on success, otherwise error. 
*/ -void mmap_action_prepare(struct mmap_action *action, - struct vm_area_desc *desc) +int mmap_action_prepare(struct vm_area_desc *desc) { - switch (action->type) { + switch (desc->action.type) { case MMAP_NOTHING: - break; + return 0; case MMAP_REMAP_PFN: - remap_pfn_range_prepare(desc, action->remap.start_pfn); - break; + return remap_pfn_range_prepare(desc); case MMAP_IO_REMAP_PFN: - io_remap_pfn_range_prepare(desc, action->remap.start_pfn, - action->remap.size); - break; + return io_remap_pfn_range_prepare(desc); } + + WARN_ON_ONCE(1); + return -EINVAL; } EXPORT_SYMBOL(mmap_action_prepare); /** * mmap_action_complete - Execute VMA descriptor action. - * @action: The action to perform. * @vma: The VMA to perform the action upon. + * @action: The action to perform. * * Similar to mmap_action_prepare(). * * Return: 0 on success, or error, at which point the VMA will be unmapped. */ -int mmap_action_complete(struct mmap_action *action, - struct vm_area_struct *vma) +int mmap_action_complete(struct vm_area_struct *vma, + struct mmap_action *action) + { int err = 0; @@ -1391,25 +1379,22 @@ int mmap_action_complete(struct mmap_action *action, case MMAP_NOTHING: break; case MMAP_REMAP_PFN: - err = remap_pfn_range_complete(vma, action->remap.start, - action->remap.start_pfn, action->remap.size, - action->remap.pgprot); + err = remap_pfn_range_complete(vma, action); break; case MMAP_IO_REMAP_PFN: - err = io_remap_pfn_range_complete(vma, action->remap.start, - action->remap.start_pfn, action->remap.size, - action->remap.pgprot); + /* Should have been delegated. 
*/ + WARN_ON_ONCE(1); + err = -EINVAL; break; } - return mmap_action_finish(action, vma, err); + return mmap_action_finish(vma, action, err); } EXPORT_SYMBOL(mmap_action_complete); #else -void mmap_action_prepare(struct mmap_action *action, - struct vm_area_desc *desc) +int mmap_action_prepare(struct vm_area_desc *desc) { - switch (action->type) { + switch (desc->action.type) { case MMAP_NOTHING: break; case MMAP_REMAP_PFN: @@ -1417,11 +1402,13 @@ void mmap_action_prepare(struct mmap_action *action, WARN_ON_ONCE(1); /* nommu cannot handle these. */ break; } + + return 0; } EXPORT_SYMBOL(mmap_action_prepare); -int mmap_action_complete(struct mmap_action *action, - struct vm_area_struct *vma) +int mmap_action_complete(struct vm_area_struct *vma, + struct mmap_action *action) { int err = 0; @@ -1436,7 +1423,7 @@ int mmap_action_complete(struct mmap_action *action, break; } - return mmap_action_finish(action, vma, err); + return mmap_action_finish(vma, action, err); } EXPORT_SYMBOL(mmap_action_complete); #endif diff --git a/mm/vma.c b/mm/vma.c index a4b30a069153..1e2996a12d7f 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2640,15 +2640,18 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) vma_set_page_prot(vma); } -static void call_action_prepare(struct mmap_state *map, - struct vm_area_desc *desc) +static int call_action_prepare(struct mmap_state *map, + struct vm_area_desc *desc) { - struct mmap_action *action = &desc->action; + int err; - mmap_action_prepare(action, desc); + err = mmap_action_prepare(desc); + if (err) + return err; - if (action->hide_from_rmap_until_complete) + if (desc->action.hide_from_rmap_until_complete) map->hold_file_rmap_lock = true; + return 0; } /* @@ -2672,7 +2675,9 @@ static int call_mmap_prepare(struct mmap_state *map, if (err) return err; - call_action_prepare(map, desc); + err = call_action_prepare(map, desc); + if (err) + return err; /* Update fields permitted to be changed. 
*/ map->pgoff = desc->pgoff; @@ -2727,13 +2732,12 @@ static bool can_set_ksm_flags_early(struct mmap_state *map) } static int call_action_complete(struct mmap_state *map, - struct vm_area_desc *desc, + struct mmap_action *action, struct vm_area_struct *vma) { - struct mmap_action *action = &desc->action; int ret; - ret = mmap_action_complete(action, vma); + ret = mmap_action_complete(vma, action); /* If we held the file rmap we need to release it. */ if (map->hold_file_rmap_lock) { @@ -2795,7 +2799,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, __mmap_complete(&map, vma); if (have_mmap_prepare && allocated_new) { - error = call_action_complete(&map, &desc, vma); + error = call_action_complete(&map, &desc.action, vma); if (error) return error; diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index ab92358b082c..e7581efaf470 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -1277,9 +1277,12 @@ static inline int __compat_vma_mmap(const struct file_operations *f_op, if (err) return err; - mmap_action_prepare(&desc.action, &desc); + err = mmap_action_prepare(&desc); + if (err) + return err; + set_vma_from_desc(vma, &desc); - return mmap_action_complete(&desc.action, vma); + return mmap_action_complete(vma, &desc.action); } static inline int compat_vma_mmap(struct file *file, diff --git a/tools/testing/vma/include/stubs.h b/tools/testing/vma/include/stubs.h index 5afb0afe2d48..a30b8bc84955 100644 --- a/tools/testing/vma/include/stubs.h +++ b/tools/testing/vma/include/stubs.h @@ -81,13 +81,13 @@ static inline void free_anon_vma_name(struct vm_area_struct *vma) { } -static inline void mmap_action_prepare(struct mmap_action *action, - struct vm_area_desc *desc) +static inline int mmap_action_prepare(struct vm_area_desc *desc) { + return 0; } -static inline int mmap_action_complete(struct mmap_action *action, - struct vm_area_struct *vma) +static inline int 
mmap_action_complete(struct vm_area_struct *vma, + struct mmap_action *action) { return 0; } -- cgit v1.2.3 From 827e97cf4bf59e9a72bcec37944bcebb3139a457 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:29 +0000 Subject: mm: document vm_operations_struct->open the same as close() Describe when the operation is invoked and the context in which it is invoked, matching the description already added for vm_op->close(). While we're here, update all outdated references to an 'area' field for VMAs to the more consistent 'vma'. Link: https://lkml.kernel.org/r/7d0ca833c12014320f0fa00f816f95e6e10076f2.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- include/linux/mm.h | 15 ++++++++++----- tools/testing/vma/include/dup.h | 15 ++++++++++----- 2 files changed, 20 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6ca2fc5ae83f..21a2eef5f8fe 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -764,15 +764,20 @@ struct vm_fault { * to the functions called when a no-page or a wp-page exception occurs. */ struct vm_operations_struct { - void (*open)(struct vm_area_struct * area); + /** + * @open: Called when a VMA is remapped, split or forked. Not called + * upon first mapping a VMA. + * Context: User context. May sleep. 
Caller holds mmap_lock. + */ + void (*open)(struct vm_area_struct *vma); /** * @close: Called when the VMA is being removed from the MM. * Context: User context. May sleep. Caller holds mmap_lock. */ - void (*close)(struct vm_area_struct * area); + void (*close)(struct vm_area_struct *vma); /* Called any time before splitting to check if it's allowed */ - int (*may_split)(struct vm_area_struct *area, unsigned long addr); - int (*mremap)(struct vm_area_struct *area); + int (*may_split)(struct vm_area_struct *vma, unsigned long addr); + int (*mremap)(struct vm_area_struct *vma); /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not @@ -784,7 +789,7 @@ struct vm_operations_struct { vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); - unsigned long (*pagesize)(struct vm_area_struct * area); + unsigned long (*pagesize)(struct vm_area_struct *vma); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index e7581efaf470..5bc04c801504 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -632,15 +632,20 @@ struct vm_area_struct { } __randomize_layout; struct vm_operations_struct { - void (*open)(struct vm_area_struct * area); + /** + * @open: Called when a VMA is remapped, split or forked. Not called + * upon first mapping a VMA. + * Context: User context. May sleep. Caller holds mmap_lock. + */ + void (*open)(struct vm_area_struct *vma); /** * @close: Called when the VMA is being removed from the MM. * Context: User context. May sleep. Caller holds mmap_lock. 
*/ - void (*close)(struct vm_area_struct * area); + void (*close)(struct vm_area_struct *vma); /* Called any time before splitting to check if it's allowed */ - int (*may_split)(struct vm_area_struct *area, unsigned long addr); - int (*mremap)(struct vm_area_struct *area); + int (*may_split)(struct vm_area_struct *vma, unsigned long addr); + int (*mremap)(struct vm_area_struct *vma); /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not @@ -652,7 +657,7 @@ struct vm_operations_struct { vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); - unsigned long (*pagesize)(struct vm_area_struct * area); + unsigned long (*pagesize)(struct vm_area_struct *vma); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ -- cgit v1.2.3 From c50ca15dd4962bdf834945c2fa29b904042f366a Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:34 +0000 Subject: mm: add vm_ops->mapped hook Previously, when a driver needed to do something like establish a reference count, it could do so in the mmap hook in the knowledge that the mapping would succeed. With the introduction of f_op->mmap_prepare this is no longer the case, as it is invoked prior to actually establishing the mapping. mmap_prepare is not appropriate for this kind of thing as it is called before any merge might take place, and after which an error might occur meaning resources could be leaked. To take this into account, introduce a new vm_ops->mapped callback which is invoked when the VMA is first mapped (though notably - not when it is merged - which is correct and mirrors existing mmap/open/close behaviour). We do better than vm_ops->open() here, as this callback can return an error, at which point the VMA will be unmapped.
Note that vm_ops->mapped() is invoked after any mmap action is complete (such as I/O remapping). We intentionally do not expose the VMA at this point, exposing only the fields that could be used, and an output parameter in case the operation needs to update the vma->vm_private_data field. In order to deal with stacked filesystems which invoke inner filesystem's mmap() invocations, add __compat_vma_mapped() and invoke it on vfs_mmap() (via compat_vma_mmap()) to ensure that the mapped callback is handled when an mmap() caller invokes a nested filesystem's mmap_prepare() callback. Update the mmap_prepare documentation to describe the mapped hook and make it clear what its intended use is. The vm_ops->mapped() call is handled by the mmap complete logic to ensure the same code paths are handled by both the compatibility and VMA layers. Additionally, update VMA userland test headers to reflect the change. Link: https://lkml.kernel.org/r/4c5e98297eb0aae9565c564e1c296a112702f144.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. 
Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- Documentation/filesystems/mmap_prepare.rst | 15 +++++ include/linux/fs.h | 9 ++- include/linux/mm.h | 17 ++++++ mm/util.c | 90 +++++++++++++++++++++--------- mm/vma.c | 1 - tools/testing/vma/include/dup.h | 17 ++++++ 6 files changed, 120 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/mmap_prepare.rst b/Documentation/filesystems/mmap_prepare.rst index ae484d371861..f14b35ee11d5 100644 --- a/Documentation/filesystems/mmap_prepare.rst +++ b/Documentation/filesystems/mmap_prepare.rst @@ -25,6 +25,21 @@ That is - no resources should be allocated nor state updated to reflect that a mapping has been established, as the mapping may either be merged, or fail to be mapped after the callback is complete. +Mapped callback +--------------- + +If resources need to be allocated per-mapping, or state such as a reference +count needs to be manipulated, this should be done using the ``vm_ops->mapped`` +hook, which itself should be set by the >mmap_prepare hook. + +This callback is only invoked if a new mapping has been established and was not +merged with any other, and is invoked at a point where no error may occur before +the mapping is established. + +You may return an error to the callback itself, which will cause the mapping to +become unmapped and an error returned to the mmap() caller. This is useful if +resources need to be allocated, and that allocation might fail. 
+ How To Use ========== diff --git a/include/linux/fs.h b/include/linux/fs.h index a2628a12bd2b..c390f5c667e3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2059,13 +2059,20 @@ static inline bool can_mmap_file(struct file *file) } int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); +int __vma_check_mmap_hook(struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { + int err; + if (file->f_op->mmap_prepare) return compat_vma_mmap(file, vma); - return file->f_op->mmap(file, vma); + err = file->f_op->mmap(file, vma); + if (err) + return err; + + return __vma_check_mmap_hook(vma); } static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc) diff --git a/include/linux/mm.h b/include/linux/mm.h index 21a2eef5f8fe..81fbcfed44dd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -775,6 +775,23 @@ struct vm_operations_struct { * Context: User context. May sleep. Caller holds mmap_lock. */ void (*close)(struct vm_area_struct *vma); + /** + * @mapped: Called when the VMA is first mapped in the MM. Not called if + * the new VMA is merged with an adjacent VMA. + * + * The @vm_private_data field is an output field allowing the user to + * modify vma->vm_private_data as necessary. + * + * ONLY valid if set from f_op->mmap_prepare. Will result in an error if + * set from f_op->mmap. + * + * Returns %0 on success, or an error otherwise. On error, the VMA will + * be unmapped. + * + * Context: User context. May sleep. Caller holds mmap_lock. 
+ */ + int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff, + const struct file *file, void **vm_private_data); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *vma, unsigned long addr); int (*mremap)(struct vm_area_struct *vma); diff --git a/mm/util.c b/mm/util.c index e272efca8c0e..98fe67e59ec3 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1163,33 +1163,7 @@ void flush_dcache_folio(struct folio *folio) EXPORT_SYMBOL(flush_dcache_folio); #endif -/** - * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an - * existing VMA and execute any requested actions. - * @file: The file which possesss an f_op->mmap_prepare() hook. - * @vma: The VMA to apply the .mmap_prepare() hook to. - * - * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain - * stacked filesystems invoke a nested mmap hook of an underlying file. - * - * Until all filesystems are converted to use .mmap_prepare(), we must be - * conservative and continue to invoke these stacked filesystems using the - * deprecated .mmap() hook. - * - * However we have a problem if the underlying file system possesses an - * .mmap_prepare() hook, as we are in a different context when we invoke the - * .mmap() hook, already having a VMA to deal with. - * - * compat_vma_mmap() is a compatibility function that takes VMA state, - * establishes a struct vm_area_desc descriptor, passes to the underlying - * .mmap_prepare() hook and applies any changes performed by it. - * - * Once the conversion of filesystems is complete this function will no longer - * be required and will be removed. - * - * Returns: 0 on success or error. 
- */ -int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) +static int __compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { struct vm_area_desc desc = { .mm = vma->vm_mm, @@ -1221,8 +1195,49 @@ int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) set_vma_from_desc(vma, &desc); return mmap_action_complete(vma, action); } + +/** + * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an + * existing VMA and execute any requested actions. + * @file: The file which possesses an f_op->mmap_prepare() hook. + * @vma: The VMA to apply the .mmap_prepare() hook to. + * + * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain + * stacked filesystems invoke a nested mmap hook of an underlying file. + * + * Until all filesystems are converted to use .mmap_prepare(), we must be + * conservative and continue to invoke these stacked filesystems using the + * deprecated .mmap() hook. + * + * However we have a problem if the underlying file system possesses an + * .mmap_prepare() hook, as we are in a different context when we invoke the + * .mmap() hook, already having a VMA to deal with. + * + * compat_vma_mmap() is a compatibility function that takes VMA state, + * establishes a struct vm_area_desc descriptor, passes to the underlying + * .mmap_prepare() hook and applies any changes performed by it. + * + * Once the conversion of filesystems is complete this function will no longer + * be required and will be removed. + * + * Returns: 0 on success or error. + */ +int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) +{ + return __compat_vma_mmap(file, vma); +} EXPORT_SYMBOL(compat_vma_mmap); +int __vma_check_mmap_hook(struct vm_area_struct *vma) +{ + /* vm_ops->mapped is not valid if mmap() is specified.
*/ + if (vma->vm_ops && WARN_ON_ONCE(vma->vm_ops->mapped)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(__vma_check_mmap_hook); + static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio, const struct page *page) { @@ -1311,11 +1326,32 @@ again: } } +static int call_vma_mapped(struct vm_area_struct *vma) +{ + const struct vm_operations_struct *vm_ops = vma->vm_ops; + void *vm_private_data = vma->vm_private_data; + int err; + + if (!vm_ops || !vm_ops->mapped) + return 0; + + err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff, + vma->vm_file, &vm_private_data); + if (err) + return err; + + if (vm_private_data != vma->vm_private_data) + vma->vm_private_data = vm_private_data; + return 0; +} + static int mmap_action_finish(struct vm_area_struct *vma, struct mmap_action *action, int err) { size_t len; + if (!err) + err = call_vma_mapped(vma); if (!err && action->success_hook) err = action->success_hook(vma); diff --git a/mm/vma.c b/mm/vma.c index e1950ae048e2..a43f3c5d4b3d 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2781,7 +2781,6 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, if (have_mmap_prepare && allocated_new) { error = mmap_action_complete(vma, &desc.action); - if (error) return error; } diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index a95a4b07f68b..1fb7bcae4f31 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -643,6 +643,23 @@ struct vm_operations_struct { * Context: User context. May sleep. Caller holds mmap_lock. */ void (*close)(struct vm_area_struct *vma); + /** + * @mapped: Called when the VMA is first mapped in the MM. Not called if + * the new VMA is merged with an adjacent VMA. + * + * The @vm_private_data field is an output field allowing the user to + * modify vma->vm_private_data as necessary. + * + * ONLY valid if set from f_op->mmap_prepare. Will result in an error if + * set from f_op->mmap. 
+ * + * Returns %0 on success, or an error otherwise. On error, the VMA will + * be unmapped. + * + * Context: User context. May sleep. Caller holds mmap_lock. + */ + int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff, + const struct file *file, void **vm_private_data); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *vma, unsigned long addr); int (*mremap)(struct vm_area_struct *vma); -- cgit v1.2.3 From a1b7fb40cb71a33c68a609fcee0946425d698415 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:37 +0000 Subject: mm: add mmap_action_simple_ioremap() Currently drivers use vm_iomap_memory() as a simple helper function for I/O remapping memory over a range starting at a specified physical address over a specified length. In order to utilise this from mmap_prepare, separate out the core logic into __simple_ioremap_prep(), update vm_iomap_memory() to use it, and add simple_ioremap_prepare() to do the same with a VMA descriptor object. We also add MMAP_SIMPLE_IO_REMAP and relevant fields to the struct mmap_action type to permit this operation also. We use mmap_action_ioremap() to set up the actual I/O remap operation once we have checked and figured out the parameters, which makes simple_ioremap_prepare() easy to implement. We then add mmap_action_simple_ioremap() to allow drivers to make use of this mode. We update the mmap_prepare documentation to describe this mode. Finally, we update the VMA tests to reflect this change. 
Link: https://lkml.kernel.org/r/a08ef1c4542202684da63bb37f459d5dbbeddd91.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- Documentation/filesystems/mmap_prepare.rst | 3 ++ include/linux/mm.h | 24 ++++++++- include/linux/mm_types.h | 6 ++- mm/internal.h | 1 + mm/memory.c | 85 +++++++++++++++++++++--------- mm/util.c | 5 ++ tools/testing/vma/include/dup.h | 6 ++- 7 files changed, 102 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/mmap_prepare.rst b/Documentation/filesystems/mmap_prepare.rst index f14b35ee11d5..14bb057be564 100644 --- a/Documentation/filesystems/mmap_prepare.rst +++ b/Documentation/filesystems/mmap_prepare.rst @@ -153,5 +153,8 @@ pointer. These are: * mmap_action_ioremap_full() - Same as mmap_action_ioremap(), only remaps the entire mapping from ``start_pfn`` onward. +* mmap_action_simple_ioremap() - Sets up an I/O remap from a specified + physical address and over a specified length. + **NOTE:** The ``action`` field should never normally be manipulated directly, rather you ought to use one of these helpers. 
diff --git a/include/linux/mm.h b/include/linux/mm.h index 81fbcfed44dd..53b21de40f87 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4321,11 +4321,33 @@ static inline void mmap_action_ioremap(struct vm_area_desc *desc, * @start_pfn: The first PFN in the range to remap. */ static inline void mmap_action_ioremap_full(struct vm_area_desc *desc, - unsigned long start_pfn) + unsigned long start_pfn) { mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc)); } +/** + * mmap_action_simple_ioremap - helper for mmap_prepare hook to specify that the + * physical range in [start_phys_addr, start_phys_addr + size) should be I/O + * remapped. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start_phys_addr: Start of the physical memory to be mapped. + * @size: Size of the area to map. + * + * NOTE: Some drivers might want to tweak desc->page_prot for purposes of + * write-combine or similar. + */ +static inline void mmap_action_simple_ioremap(struct vm_area_desc *desc, + phys_addr_t start_phys_addr, + unsigned long size) +{ + struct mmap_action *action = &desc->action; + + action->simple_ioremap.start_phys_addr = start_phys_addr; + action->simple_ioremap.size = size; + action->type = MMAP_SIMPLE_IO_REMAP; +} + int mmap_action_prepare(struct vm_area_desc *desc); int mmap_action_complete(struct vm_area_struct *vma, struct mmap_action *action); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 38fe6b915024..91a3db174d78 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -814,6 +814,7 @@ enum mmap_action_type { MMAP_NOTHING, /* Mapping is complete, no further action. */ MMAP_REMAP_PFN, /* Remap PFN range. */ MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ + MMAP_SIMPLE_IO_REMAP, /* I/O remap with guardrails. */ }; /* @@ -822,13 +823,16 @@ enum mmap_action_type { */ struct mmap_action { union { - /* Remap range. 
*/ struct { unsigned long start; unsigned long start_pfn; unsigned long size; pgprot_t pgprot; } remap; + struct { + phys_addr_t start_phys_addr; + unsigned long size; + } simple_ioremap; }; enum mmap_action_type type; diff --git a/mm/internal.h b/mm/internal.h index 241510e21f4b..c693646e5b3f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1842,6 +1842,7 @@ int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm); int remap_pfn_range_prepare(struct vm_area_desc *desc); int remap_pfn_range_complete(struct vm_area_struct *vma, struct mmap_action *action); +int simple_ioremap_prepare(struct vm_area_desc *desc); static inline int io_remap_pfn_range_prepare(struct vm_area_desc *desc) { diff --git a/mm/memory.c b/mm/memory.c index 10a61dd81f97..c1c323512939 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3170,6 +3170,58 @@ int remap_pfn_range_complete(struct vm_area_struct *vma, return do_remap_pfn_range(vma, start, pfn, size, prot); } +static int __simple_ioremap_prep(unsigned long vm_len, pgoff_t vm_pgoff, + phys_addr_t start_phys, unsigned long size, + unsigned long *pfnp) +{ + unsigned long pfn, pages; + + /* Check that the physical memory area passed in looks valid */ + if (start_phys + size < start_phys) + return -EINVAL; + /* + * You *really* shouldn't map things that aren't page-aligned, + * but we've historically allowed it because IO memory might + * just have smaller alignment. + */ + size += start_phys & ~PAGE_MASK; + pfn = start_phys >> PAGE_SHIFT; + pages = (size + ~PAGE_MASK) >> PAGE_SHIFT; + if (pfn + pages < pfn) + return -EINVAL; + + /* We start the mapping 'vm_pgoff' pages into the area */ + if (vm_pgoff > pages) + return -EINVAL; + pfn += vm_pgoff; + pages -= vm_pgoff; + + /* Can we fit all of the mapping? 
*/ + if ((vm_len >> PAGE_SHIFT) > pages) + return -EINVAL; + + *pfnp = pfn; + return 0; +} + +int simple_ioremap_prepare(struct vm_area_desc *desc) +{ + struct mmap_action *action = &desc->action; + const phys_addr_t start = action->simple_ioremap.start_phys_addr; + const unsigned long size = action->simple_ioremap.size; + unsigned long pfn; + int err; + + err = __simple_ioremap_prep(vma_desc_size(desc), desc->pgoff, + start, size, &pfn); + if (err) + return err; + + /* The I/O remap logic does the heavy lifting. */ + mmap_action_ioremap_full(desc, pfn); + return io_remap_pfn_range_prepare(desc); +} + /** * vm_iomap_memory - remap memory to userspace * @vma: user vma to map to @@ -3187,32 +3239,15 @@ int remap_pfn_range_complete(struct vm_area_struct *vma, */ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) { - unsigned long vm_len, pfn, pages; - - /* Check that the physical memory area passed in looks valid */ - if (start + len < start) - return -EINVAL; - /* - * You *really* shouldn't map things that aren't page-aligned, - * but we've historically allowed it because IO memory might - * just have smaller alignment. - */ - len += start & ~PAGE_MASK; - pfn = start >> PAGE_SHIFT; - pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; - if (pfn + pages < pfn) - return -EINVAL; - - /* We start the mapping 'vm_pgoff' pages into the area */ - if (vma->vm_pgoff > pages) - return -EINVAL; - pfn += vma->vm_pgoff; - pages -= vma->vm_pgoff; + const unsigned long vm_start = vma->vm_start; + const unsigned long vm_end = vma->vm_end; + const unsigned long vm_len = vm_end - vm_start; + unsigned long pfn; + int err; - /* Can we fit all of the mapping? 
*/ - vm_len = vma->vm_end - vma->vm_start; - if (vm_len >> PAGE_SHIFT > pages) - return -EINVAL; + err = __simple_ioremap_prep(vm_len, vma->vm_pgoff, start, len, &pfn); + if (err) + return err; /* Ok, let it rip */ return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); diff --git a/mm/util.c b/mm/util.c index 98fe67e59ec3..9a27d33273fd 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1393,6 +1393,8 @@ int mmap_action_prepare(struct vm_area_desc *desc) return remap_pfn_range_prepare(desc); case MMAP_IO_REMAP_PFN: return io_remap_pfn_range_prepare(desc); + case MMAP_SIMPLE_IO_REMAP: + return simple_ioremap_prepare(desc); } WARN_ON_ONCE(1); @@ -1421,6 +1423,7 @@ int mmap_action_complete(struct vm_area_struct *vma, err = remap_pfn_range_complete(vma, action); break; case MMAP_IO_REMAP_PFN: + case MMAP_SIMPLE_IO_REMAP: /* Should have been delegated. */ WARN_ON_ONCE(1); err = -EINVAL; @@ -1438,6 +1441,7 @@ int mmap_action_prepare(struct vm_area_desc *desc) break; case MMAP_REMAP_PFN: case MMAP_IO_REMAP_PFN: + case MMAP_SIMPLE_IO_REMAP: WARN_ON_ONCE(1); /* nommu cannot handle these. */ break; } @@ -1456,6 +1460,7 @@ int mmap_action_complete(struct vm_area_struct *vma, break; case MMAP_REMAP_PFN: case MMAP_IO_REMAP_PFN: + case MMAP_SIMPLE_IO_REMAP: WARN_ON_ONCE(1); /* nommu cannot handle this. */ err = -EINVAL; diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 1fb7bcae4f31..b31207bbe10d 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -453,6 +453,7 @@ enum mmap_action_type { MMAP_NOTHING, /* Mapping is complete, no further action. */ MMAP_REMAP_PFN, /* Remap PFN range. */ MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ + MMAP_SIMPLE_IO_REMAP, /* I/O remap with guardrails. */ }; /* @@ -461,13 +462,16 @@ enum mmap_action_type { */ struct mmap_action { union { - /* Remap range. 
*/ struct { unsigned long start; unsigned long start_pfn; unsigned long size; pgprot_t pgprot; } remap; + struct { + phys_addr_t start_phys_addr; + unsigned long size; + } simple_ioremap; }; enum mmap_action_type type; -- cgit v1.2.3 From 668937b7b2256f4b2a982e8f69b07d9ee8f81d36 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:43 +0000 Subject: mm: allow handling of stacked mmap_prepare hooks in more drivers While the conversion of mmap hooks to mmap_prepare is underway, we will encounter situations where mmap hooks need to invoke nested mmap_prepare hooks. The nesting of mmap hooks is termed 'stacking'. In order to flexibly facilitate the conversion of custom mmap hooks in drivers which stack, we must split up the existing __compat_vma_mmap() function into two separate functions: * compat_set_desc_from_vma() - This allows the setting of a vm_area_desc object's fields to the relevant fields of a VMA. * __compat_vma_mmap() - Once an mmap_prepare hook has been executed upon a vm_area_desc object, this function performs any mmap actions specified by the mmap_prepare hook and then invokes its vm_ops->mapped() hook if any were specified. In ordinary cases, where a file's f_op->mmap_prepare() hook simply needs to be invoked in a stacked mmap() hook, compat_vma_mmap() can be used. However some drivers define their own nested hooks, which are invoked in turn by another hook. A concrete example is vmbus_channel->mmap_ring_buffer(), which is invoked in turn by bin_attribute->mmap(): vmbus_channel->mmap_ring_buffer() has a signature of: int (*mmap_ring_buffer)(struct vmbus_channel *channel, struct vm_area_struct *vma); And bin_attribute->mmap() has a signature of: int (*mmap)(struct file *, struct kobject *, const struct bin_attribute *attr, struct vm_area_struct *vma); And so compat_vma_mmap() cannot be used here for incremental conversion of hooks from mmap() to mmap_prepare(). 
There are many such instances like this, where conversion to mmap_prepare would otherwise cascade to a huge change set due to nesting of this kind. The changes in this patch mean we could now instead convert vmbus_channel->mmap_ring_buffer() to vmbus_channel->mmap_prepare_ring_buffer(), and implement something like: struct vm_area_desc desc; int err; compat_set_desc_from_vma(&desc, file, vma); err = channel->mmap_prepare_ring_buffer(channel, &desc); if (err) return err; return __compat_vma_mmap(&desc, vma); Allowing us to incrementally update this logic, and other logic like it. Unfortunately, as part of this change, we need to be able to flexibly assign to the VMA descriptor, so have to remove some of the const declarations within the structure. Also update the VMA tests to reflect the changes. Link: https://lkml.kernel.org/r/24aac3019dd34740e788d169fccbe3c62781e648.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. 
Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- include/linux/fs.h | 3 + include/linux/mm_types.h | 4 +- mm/util.c | 119 +++++++++++++++++++++++++++++----------- mm/vma.h | 2 +- tools/testing/vma/include/dup.h | 68 +++++++++++++++-------- 5 files changed, 136 insertions(+), 60 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index c390f5c667e3..0bdccfa70b44 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2058,6 +2058,9 @@ static inline bool can_mmap_file(struct file *file) return true; } +void compat_set_desc_from_vma(struct vm_area_desc *desc, const struct file *file, + const struct vm_area_struct *vma); +int __compat_vma_mmap(struct vm_area_desc *desc, struct vm_area_struct *vma); int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); int __vma_check_mmap_hook(struct vm_area_struct *vma); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 91a3db174d78..b702c63bf0e0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -891,8 +891,8 @@ static __always_inline bool vma_flags_empty(const vma_flags_t *flags) */ struct vm_area_desc { /* Immutable state. */ - const struct mm_struct *const mm; - struct file *const file; /* May vary from vm_file in stacked callers. */ + struct mm_struct *mm; + struct file *file; /* May vary from vm_file in stacked callers. 
*/ unsigned long start; unsigned long end; diff --git a/mm/util.c b/mm/util.c index 9a27d33273fd..5ae20876ef2c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1163,38 +1163,78 @@ void flush_dcache_folio(struct folio *folio) EXPORT_SYMBOL(flush_dcache_folio); #endif -static int __compat_vma_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct vm_area_desc desc = { - .mm = vma->vm_mm, - .file = file, - .start = vma->vm_start, - .end = vma->vm_end, - - .pgoff = vma->vm_pgoff, - .vm_file = vma->vm_file, - .vma_flags = vma->flags, - .page_prot = vma->vm_page_prot, - - .action.type = MMAP_NOTHING, /* Default */ - }; - struct mmap_action *action = &desc.action; - int err; +/** + * compat_set_desc_from_vma() - assigns VMA descriptor @desc fields from a VMA. + * @desc: A VMA descriptor whose fields need to be set. + * @file: The file object describing the file being mmap()'d. + * @vma: The VMA whose fields we wish to assign to @desc. + * + * This is a compatibility function to allow an mmap() hook to call + * mmap_prepare() hooks when drivers nest these. This function specifically + * allows the construction of a vm_area_desc value, @desc, from a VMA @vma for + * the purposes of doing this. + * + * Once the conversion of drivers is complete this function will no longer be + * required and will be removed. + */ +void compat_set_desc_from_vma(struct vm_area_desc *desc, + const struct file *file, + const struct vm_area_struct *vma) +{ + memset(desc, 0, sizeof(*desc)); - err = vfs_mmap_prepare(file, &desc); - if (err) - return err; + desc->mm = vma->vm_mm; + desc->file = (struct file *)file; + desc->start = vma->vm_start; + desc->end = vma->vm_end; - err = mmap_action_prepare(&desc); - if (err) - return err; + desc->pgoff = vma->vm_pgoff; + desc->vm_file = vma->vm_file; + desc->vma_flags = vma->flags; + desc->page_prot = vma->vm_page_prot; - /* being invoked from .mmap means we don't have to enforce this. */ - action->hide_from_rmap_until_complete = false; + /* Default. 
*/ + desc->action.type = MMAP_NOTHING; +} +EXPORT_SYMBOL(compat_set_desc_from_vma); + +/** + * __compat_vma_mmap() - Similar to compat_vma_mmap(), only it allows + * flexibility as to how the mmap_prepare callback is invoked, which is useful + * for drivers which invoke nested mmap_prepare callbacks in an mmap() hook. + * @desc: A VMA descriptor upon which an mmap_prepare() hook has already been + * executed. + * @vma: The VMA to which @desc should be applied. + * + * The function assumes that you have obtained a VMA descriptor @desc from + * compat_set_desc_from_vma(), and already executed the mmap_prepare() hook upon + * it. + * + * It then performs any specified mmap actions, and invokes the vm_ops->mapped() + * hook if one is present. + * + * See the description of compat_vma_mmap() for more details. + * + * Once the conversion of drivers is complete this function will no longer be + * required and will be removed. + * + * Returns: 0 on success or error. + */ +int __compat_vma_mmap(struct vm_area_desc *desc, + struct vm_area_struct *vma) +{ + int err; - set_vma_from_desc(vma, &desc); - return mmap_action_complete(vma, action); + /* Perform any preparatory tasks for mmap action. */ + err = mmap_action_prepare(desc); + if (err) + return err; + /* Update the VMA from the descriptor. */ + compat_set_vma_from_desc(vma, desc); + /* Complete any specified mmap actions. */ + return mmap_action_complete(vma, &desc->action); } +EXPORT_SYMBOL(__compat_vma_mmap); /** * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an @@ -1203,10 +1243,10 @@ static int __compat_vma_mmap(struct file *file, struct vm_area_struct *vma) * @vma: The VMA to apply the .mmap_prepare() hook to. * * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain - * stacked filesystems invoke a nested mmap hook of an underlying file. + * stacked drivers invoke a nested mmap hook of an underlying file. 
* - * Until all filesystems are converted to use .mmap_prepare(), we must be - * conservative and continue to invoke these stacked filesystems using the + * Until all drivers are converted to use .mmap_prepare(), we must be + * conservative and continue to invoke these stacked drivers using the * deprecated .mmap() hook. * * However we have a problem if the underlying file system possesses an @@ -1217,14 +1257,27 @@ static int __compat_vma_mmap(struct file *file, struct vm_area_struct *vma) * establishes a struct vm_area_desc descriptor, passes to the underlying * .mmap_prepare() hook and applies any changes performed by it. * - * Once the conversion of filesystems is complete this function will no longer - * be required and will be removed. + * Once the conversion of drivers is complete this function will no longer be + * required and will be removed. * * Returns: 0 on success or error. */ int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { - return __compat_vma_mmap(file, vma); + struct vm_area_desc desc; + struct mmap_action *action; + int err; + + compat_set_desc_from_vma(&desc, file, vma); + err = vfs_mmap_prepare(file, &desc); + if (err) + return err; + action = &desc.action; + + /* being invoked from .mmap means we don't have to enforce this. */ + action->hide_from_rmap_until_complete = false; + + return __compat_vma_mmap(&desc, vma); } EXPORT_SYMBOL(compat_vma_mmap); diff --git a/mm/vma.h b/mm/vma.h index 1bfe7e47f6be..8e4b61a7304c 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -300,7 +300,7 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi, * f_op->mmap() but which might have an underlying file system which implements * f_op->mmap_prepare().
*/ -static inline void set_vma_from_desc(struct vm_area_struct *vma, +static inline void compat_set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc) { /* diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index b31207bbe10d..ecd47d0f7d17 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -519,8 +519,8 @@ enum vma_operation { */ struct vm_area_desc { /* Immutable state. */ - const struct mm_struct *const mm; - struct file *const file; /* May vary from vm_file in stacked callers. */ + struct mm_struct *mm; + struct file *file; /* May vary from vm_file in stacked callers. */ unsigned long start; unsigned long end; @@ -1278,50 +1278,70 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma) } /* Declared in vma.h. */ -static inline void set_vma_from_desc(struct vm_area_struct *vma, +static inline void compat_set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc); -static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc) +static inline void compat_set_desc_from_vma(struct vm_area_desc *desc, + const struct file *file, + const struct vm_area_struct *vma) { - return file->f_op->mmap_prepare(desc); + memset(desc, 0, sizeof(*desc)); + + desc->mm = vma->vm_mm; + desc->file = (struct file *)file; + desc->start = vma->vm_start; + desc->end = vma->vm_end; + + desc->pgoff = vma->vm_pgoff; + desc->vm_file = vma->vm_file; + desc->vma_flags = vma->flags; + desc->page_prot = vma->vm_page_prot; + + /* Default. 
*/ + desc->action.type = MMAP_NOTHING; } -static inline unsigned long vma_pages(struct vm_area_struct *vma) +static inline unsigned long vma_pages(const struct vm_area_struct *vma) { return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } -static inline int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) +static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc) { - struct vm_area_desc desc = { - .mm = vma->vm_mm, - .file = file, - .start = vma->vm_start, - .end = vma->vm_end, - - .pgoff = vma->vm_pgoff, - .vm_file = vma->vm_file, - .vma_flags = vma->flags, - .page_prot = vma->vm_page_prot, + return file->f_op->mmap_prepare(desc); +} - .action.type = MMAP_NOTHING, /* Default */ - }; - struct mmap_action *action = &desc.action; +static inline int __compat_vma_mmap(struct vm_area_desc *desc, + struct vm_area_struct *vma) +{ int err; - err = vfs_mmap_prepare(file, &desc); + /* Perform any preparatory tasks for mmap action. */ + err = mmap_action_prepare(desc); if (err) return err; + /* Update the VMA from the descriptor. */ + compat_set_vma_from_desc(vma, desc); + /* Complete any specified mmap actions. */ + return mmap_action_complete(vma, &desc->action); +} - err = mmap_action_prepare(&desc); +static inline int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct vm_area_desc desc; + struct mmap_action *action; + int err; + + compat_set_desc_from_vma(&desc, file, vma); + err = vfs_mmap_prepare(file, &desc); if (err) return err; + action = &desc.action; /* being invoked from .mmmap means we don't have to enforce this. 
*/ action->hide_from_rmap_until_complete = false; - set_vma_from_desc(vma, &desc); - return mmap_action_complete(vma, action); + return __compat_vma_mmap(&desc, vma); } static inline void vma_iter_init(struct vma_iterator *vmi, -- cgit v1.2.3 From f98cb7ca4aa44645347771c2c2a9724bc210c49e Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:44 +0000 Subject: drivers: hv: vmbus: replace deprecated mmap hook with mmap_prepare The f_op->mmap interface is deprecated, so update the vmbus driver to use its successor, mmap_prepare. This updates all callbacks which referenced the function pointer hv_mmap_ring_buffer to instead reference hv_mmap_prepare_ring_buffer, utilising the newly introduced compat_set_desc_from_vma() and __compat_vma_mmap() to be able to implement this change. The UIO HV generic driver is the only user of hv_create_ring_sysfs(), which is the only function which references vmbus_channel->mmap_prepare_ring_buffer which, in turn, is the only external interface to hv_mmap_prepare_ring_buffer. This patch therefore updates this caller to use mmap_prepare instead, which also previously used vm_iomap_memory(), so this change replaces it with its mmap_prepare equivalent, mmap_action_simple_ioremap(). [akpm@linux-foundation.org: restore struct vmbus_channel comment, per Michael Kelley] Link: https://lkml.kernel.org/r/05467cb62267d750e5c770147517d4df0246cda6.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Michael Kelley Tested-by: Michael Kelley Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. 
Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- drivers/hv/hyperv_vmbus.h | 4 ++-- drivers/hv/vmbus_drv.c | 31 +++++++++++++++++++------------ drivers/uio/uio_hv_generic.c | 11 ++++++----- include/linux/hyperv.h | 4 ++-- 4 files changed, 29 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 7bd8f8486e85..31f576464f18 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -545,8 +545,8 @@ static inline int hv_debug_add_dev_dir(struct hv_device *dev) /* Create and remove sysfs entry for memory mapped ring buffers for a channel */ int hv_create_ring_sysfs(struct vmbus_channel *channel, - int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel, - struct vm_area_struct *vma)); + int (*hv_mmap_prepare_ring_buffer)(struct vmbus_channel *channel, + struct vm_area_desc *desc)); int hv_remove_ring_sysfs(struct vmbus_channel *channel); #endif /* _HYPERV_VMBUS_H */ diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index bc4fc1951ae1..45625487ba36 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1951,12 +1951,19 @@ static int hv_mmap_ring_buffer_wrapper(struct file *filp, struct kobject *kobj, struct vm_area_struct *vma) { struct vmbus_channel *channel = container_of(kobj, struct vmbus_channel, kobj); + struct vm_area_desc desc; + int err; /* - * hv_(create|remove)_ring_sysfs implementation ensures that mmap_ring_buffer - * is not NULL. + * hv_(create|remove)_ring_sysfs implementation ensures that + * mmap_prepare_ring_buffer is not NULL. 
*/ - return channel->mmap_ring_buffer(channel, vma); + compat_set_desc_from_vma(&desc, filp, vma); + err = channel->mmap_prepare_ring_buffer(channel, &desc); + if (err) + return err; + + return __compat_vma_mmap(&desc, vma); } static struct bin_attribute chan_attr_ring_buffer = { @@ -2048,13 +2055,13 @@ static const struct kobj_type vmbus_chan_ktype = { /** * hv_create_ring_sysfs() - create "ring" sysfs entry corresponding to ring buffers for a channel. * @channel: Pointer to vmbus_channel structure - * @hv_mmap_ring_buffer: function pointer for initializing the function to be called on mmap of + * @hv_mmap_prepare_ring_buffer: function pointer for initializing the function to be called on mmap of * channel's "ring" sysfs node, which is for the ring buffer of that channel. * Function pointer is of below type: - * int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel, - * struct vm_area_struct *vma)) - * This has a pointer to the channel and a pointer to vm_area_struct, - * used for mmap, as arguments. + * int (*hv_mmap_prepare_ring_buffer)(struct vmbus_channel *channel, + * struct vm_area_desc *desc)) + * This has a pointer to the channel and a pointer to vm_area_desc, + * used for mmap_prepare, as arguments. * * Sysfs node for ring buffer of a channel is created along with other fields, however its * visibility is disabled by default. Sysfs creation needs to be controlled when the use-case @@ -2071,12 +2078,12 @@ static const struct kobj_type vmbus_chan_ktype = { * Returns 0 on success or error code on failure.
*/ int hv_create_ring_sysfs(struct vmbus_channel *channel, - int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel, - struct vm_area_struct *vma)) + int (*hv_mmap_prepare_ring_buffer)(struct vmbus_channel *channel, + struct vm_area_desc *desc)) { struct kobject *kobj = &channel->kobj; - channel->mmap_ring_buffer = hv_mmap_ring_buffer; + channel->mmap_prepare_ring_buffer = hv_mmap_prepare_ring_buffer; channel->ring_sysfs_visible = true; return sysfs_update_group(kobj, &vmbus_chan_group); @@ -2098,7 +2105,7 @@ int hv_remove_ring_sysfs(struct vmbus_channel *channel) channel->ring_sysfs_visible = false; ret = sysfs_update_group(kobj, &vmbus_chan_group); - channel->mmap_ring_buffer = NULL; + channel->mmap_prepare_ring_buffer = NULL; return ret; } EXPORT_SYMBOL_GPL(hv_remove_ring_sysfs); diff --git a/drivers/uio/uio_hv_generic.c b/drivers/uio/uio_hv_generic.c index 3f8e2e27697f..29ec2d15ada8 100644 --- a/drivers/uio/uio_hv_generic.c +++ b/drivers/uio/uio_hv_generic.c @@ -154,15 +154,16 @@ static void hv_uio_rescind(struct vmbus_channel *channel) * The ring buffer is allocated as contiguous memory by vmbus_open */ static int -hv_uio_ring_mmap(struct vmbus_channel *channel, struct vm_area_struct *vma) +hv_uio_ring_mmap_prepare(struct vmbus_channel *channel, struct vm_area_desc *desc) { void *ring_buffer = page_address(channel->ringbuffer_page); if (channel->state != CHANNEL_OPENED_STATE) return -ENODEV; - return vm_iomap_memory(vma, virt_to_phys(ring_buffer), - channel->ringbuffer_pagecount << PAGE_SHIFT); + mmap_action_simple_ioremap(desc, virt_to_phys(ring_buffer), + channel->ringbuffer_pagecount << PAGE_SHIFT); + return 0; } /* Callback from VMBUS subsystem when new channel created. 
*/ @@ -183,7 +184,7 @@ hv_uio_new_channel(struct vmbus_channel *new_sc) } set_channel_read_mode(new_sc, HV_CALL_ISR); - ret = hv_create_ring_sysfs(new_sc, hv_uio_ring_mmap); + ret = hv_create_ring_sysfs(new_sc, hv_uio_ring_mmap_prepare); if (ret) { dev_err(device, "sysfs create ring bin file failed; %d\n", ret); vmbus_close(new_sc); @@ -366,7 +367,7 @@ hv_uio_probe(struct hv_device *dev, * or decoupled from uio_hv_generic probe. Userspace programs can make use of inotify * APIs to make sure that ring is created. */ - hv_create_ring_sysfs(channel, hv_uio_ring_mmap); + hv_create_ring_sysfs(channel, hv_uio_ring_mmap_prepare); hv_set_drvdata(dev, pdata); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index dfc516c1c719..a26fb8e7cedf 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1015,8 +1015,8 @@ struct vmbus_channel { /* The max size of a packet on this channel */ u32 max_pkt_size; - /* function to mmap ring buffer memory to the channel's sysfs ring attribute */ - int (*mmap_ring_buffer)(struct vmbus_channel *channel, struct vm_area_struct *vma); + /* function to mmap ring buffer memory to the channel's sysfs ring attribute */ + int (*mmap_prepare_ring_buffer)(struct vmbus_channel *channel, struct vm_area_desc *desc); /* boolean to control visibility of sysfs for ring buffer */ bool ring_sysfs_visible; -- cgit v1.2.3 From 933f05f58ac6014eaac387d22a76ace8606891d1 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:45 +0000 Subject: uio: replace deprecated mmap hook with mmap_prepare in uio_info The f_op->mmap interface is deprecated, so update uio_info to use its successor, mmap_prepare. Therefore, replace the uio_info->mmap hook with a new uio_info->mmap_prepare hook, and update its one user, target_core_user, to both specify this new mmap_prepare hook and also to use the new vm_ops->mapped() hook to continue to maintain a correct udev->kref refcount. 
Then update uio_mmap() to utilise the mmap_prepare compatibility layer to invoke this callback from the uio mmap invocation. Link: https://lkml.kernel.org/r/157583e4477705b496896c7acd4ac88a937b8fa6.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- drivers/target/target_core_user.c | 26 ++++++++++++++++++-------- drivers/uio/uio.c | 10 ++++++++-- include/linux/uio_driver.h | 4 ++-- 3 files changed, 28 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index af95531ddd35..edc2afd5f4ee 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -1860,6 +1860,17 @@ static struct page *tcmu_try_get_data_page(struct tcmu_dev *udev, uint32_t dpi) return NULL; } +static int tcmu_vma_mapped(unsigned long start, unsigned long end, pgoff_t pgoff, + const struct file *file, void **vm_private_data) +{ + struct tcmu_dev *udev = *vm_private_data; + + pr_debug("vma_mapped\n"); + + kref_get(&udev->kref); + return 0; +} + static void tcmu_vma_open(struct vm_area_struct *vma) { struct tcmu_dev *udev = vma->vm_private_data; @@ -1919,26 +1930,25 @@ static vm_fault_t tcmu_vma_fault(struct vm_fault *vmf) } static const struct vm_operations_struct tcmu_vm_ops = { + .mapped = tcmu_vma_mapped, .open = tcmu_vma_open, .close = tcmu_vma_close, 
.fault = tcmu_vma_fault, }; -static int tcmu_mmap(struct uio_info *info, struct vm_area_struct *vma) +static int tcmu_mmap_prepare(struct uio_info *info, struct vm_area_desc *desc) { struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); - vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); - vma->vm_ops = &tcmu_vm_ops; + vma_desc_set_flags(desc, VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT); + desc->vm_ops = &tcmu_vm_ops; - vma->vm_private_data = udev; + desc->private_data = udev; /* Ensure the mmap is exactly the right size */ - if (vma_pages(vma) != udev->mmap_pages) + if (vma_desc_pages(desc) != udev->mmap_pages) return -EINVAL; - tcmu_vma_open(vma); - return 0; } @@ -2253,7 +2263,7 @@ static int tcmu_configure_device(struct se_device *dev) info->irqcontrol = tcmu_irqcontrol; info->irq = UIO_IRQ_CUSTOM; - info->mmap = tcmu_mmap; + info->mmap_prepare = tcmu_mmap_prepare; info->open = tcmu_open; info->release = tcmu_release; diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index 5a4998e2caf8..1e4ade78ed84 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -850,8 +850,14 @@ static int uio_mmap(struct file *filep, struct vm_area_struct *vma) goto out; } - if (idev->info->mmap) { - ret = idev->info->mmap(idev->info, vma); + if (idev->info->mmap_prepare) { + struct vm_area_desc desc; + + compat_set_desc_from_vma(&desc, filep, vma); + ret = idev->info->mmap_prepare(idev->info, &desc); + if (ret) + goto out; + ret = __compat_vma_mmap(&desc, vma); goto out; } diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h index 334641e20fb1..02eaac47ac44 100644 --- a/include/linux/uio_driver.h +++ b/include/linux/uio_driver.h @@ -97,7 +97,7 @@ struct uio_device { * @irq_flags: flags for request_irq() * @priv: optional private data * @handler: the device's irq handler - * @mmap: mmap operation for this uio device + * @mmap_prepare: mmap_prepare operation for this uio device * @open: open operation for this uio device * @release: release operation for 
this uio device * @irqcontrol: disable/enable irqs when 0/1 is written to /dev/uioX @@ -112,7 +112,7 @@ struct uio_info { unsigned long irq_flags; void *priv; irqreturn_t (*handler)(int irq, struct uio_info *dev_info); - int (*mmap)(struct uio_info *info, struct vm_area_struct *vma); + int (*mmap_prepare)(struct uio_info *info, struct vm_area_desc *desc); int (*open)(struct uio_info *info, struct inode *inode); int (*release)(struct uio_info *info, struct inode *inode); int (*irqcontrol)(struct uio_info *info, s32 irq_on); -- cgit v1.2.3 From 62c65fd740e979a3967db08971b93aefcec510d4 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:46 +0000 Subject: mm: add mmap_action_map_kernel_pages[_full]() A user can invoke mmap_action_map_kernel_pages() to specify that the mapping should map kernel pages starting from desc->start of a specified number of pages specified in an array. In order to implement this, adjust mmap_action_prepare() to be able to return an error code, as it makes sense to assert that the specified parameters are valid as quickly as possible as well as updating the VMA flags to include VMA_MIXEDMAP_BIT as necessary. This provides an mmap_prepare equivalent of vm_insert_pages(). We additionally update the existing vm_insert_pages() code to use range_in_vma() and add a new range_in_vma_desc() helper function for the mmap_prepare case, sharing the code between the two in range_is_subset(). We add both mmap_action_map_kernel_pages() and mmap_action_map_kernel_pages_full() to allow for both partial and full VMA mappings. We update the documentation to reflect the new features. Finally, we update the VMA tests accordingly to reflect the changes. 
Link: https://lkml.kernel.org/r/926ac961690d856e67ec847bee2370ab3c6b9046.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- Documentation/filesystems/mmap_prepare.rst | 8 +++ include/linux/mm.h | 95 +++++++++++++++++++++++++++++- include/linux/mm_types.h | 7 +++ mm/memory.c | 42 +++++++++++-- mm/util.c | 7 +++ tools/testing/vma/include/dup.h | 7 +++ 6 files changed, 160 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/mmap_prepare.rst b/Documentation/filesystems/mmap_prepare.rst index 14bb057be564..82c99c95ad85 100644 --- a/Documentation/filesystems/mmap_prepare.rst +++ b/Documentation/filesystems/mmap_prepare.rst @@ -156,5 +156,13 @@ pointer. These are: * mmap_action_simple_ioremap() - Sets up an I/O remap from a specified physical address and over a specified length. +* mmap_action_map_kernel_pages() - Maps a specified array of `struct page` + pointers in the VMA from a specific offset. + +* mmap_action_map_kernel_pages_full() - Maps a specified array of `struct + page` pointers over the entire VMA. The caller must ensure there are + sufficient entries in the page array to cover the entire range of the + described VMA. + **NOTE:** The ``action`` field should never normally be manipulated directly, rather you ought to use one of these helpers. 
diff --git a/include/linux/mm.h b/include/linux/mm.h index 53b21de40f87..61dff7f03554 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2905,7 +2905,7 @@ static inline bool folio_maybe_mapped_shared(struct folio *folio) * The caller must add any reference (e.g., from folio_try_get()) it might be * holding itself to the result. * - * Returns the expected folio refcount. + * Returns: the expected folio refcount. */ static inline int folio_expected_ref_count(const struct folio *folio) { @@ -4348,6 +4348,45 @@ static inline void mmap_action_simple_ioremap(struct vm_area_desc *desc, action->type = MMAP_SIMPLE_IO_REMAP; } +/** + * mmap_action_map_kernel_pages - helper for mmap_prepare hook to specify that + * @nr_pages kernel pages contained in the @pages array should be mapped to userland + * starting at virtual address @start. + * @desc: The VMA descriptor for the VMA requiring kernel pages to be mapped. + * @start: The virtual address from which to map them. + * @pages: An array of struct page pointers describing the memory to map. + * @nr_pages: The number of entries in the @pages array. + */ +static inline void mmap_action_map_kernel_pages(struct vm_area_desc *desc, + unsigned long start, struct page **pages, + unsigned long nr_pages) +{ + struct mmap_action *action = &desc->action; + + action->type = MMAP_MAP_KERNEL_PAGES; + action->map_kernel.start = start; + action->map_kernel.pages = pages; + action->map_kernel.nr_pages = nr_pages; + action->map_kernel.pgoff = desc->pgoff; +} + +/** + * mmap_action_map_kernel_pages_full - helper for mmap_prepare hook to specify that + * kernel pages contained in the @pages array should be mapped to userland + * from @desc->start to @desc->end. + * @desc: The VMA descriptor for the VMA requiring kernel pages to be mapped. + * @pages: An array of struct page pointers describing the memory to map. + * + * The caller must ensure that @pages contains sufficient entries to cover the + * entire range described by @desc.
+ */ +static inline void mmap_action_map_kernel_pages_full(struct vm_area_desc *desc, + struct page **pages) +{ + mmap_action_map_kernel_pages(desc, desc->start, pages, + vma_desc_pages(desc)); +} + int mmap_action_prepare(struct vm_area_desc *desc); int mmap_action_complete(struct vm_area_struct *vma, struct mmap_action *action); @@ -4364,10 +4403,59 @@ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, return vma; } +/** + * range_is_subset - Is the specified inner range a subset of the outer range? + * @outer_start: The start of the outer range. + * @outer_end: The exclusive end of the outer range. + * @inner_start: The start of the inner range. + * @inner_end: The exclusive end of the inner range. + * + * Returns: %true if [inner_start, inner_end) is a subset of [outer_start, + * outer_end), otherwise %false. + */ +static inline bool range_is_subset(unsigned long outer_start, + unsigned long outer_end, + unsigned long inner_start, + unsigned long inner_end) +{ + return outer_start <= inner_start && inner_end <= outer_end; +} + +/** + * range_in_vma - is the specified [@start, @end) range a subset of the VMA? + * @vma: The VMA against which we want to check [@start, @end). + * @start: The start of the range we wish to check. + * @end: The exclusive end of the range we wish to check. + * + * Returns: %true if [@start, @end) is a subset of [@vma->vm_start, + * @vma->vm_end), %false otherwise. + */ static inline bool range_in_vma(const struct vm_area_struct *vma, unsigned long start, unsigned long end) { - return (vma && vma->vm_start <= start && end <= vma->vm_end); + if (!vma) + return false; + + return range_is_subset(vma->vm_start, vma->vm_end, start, end); +} + +/** + * range_in_vma_desc - is the specified [@start, @end) range a subset of the VMA + * described by @desc, a VMA descriptor? + * @desc: The VMA descriptor against which we want to check [@start, @end). + * @start: The start of the range we wish to check. 
+ * @end: The exclusive end of the range we wish to check. + * + * Returns: %true if [@start, @end) is a subset of [@desc->start, @desc->end), + * %false otherwise. + */ +static inline bool range_in_vma_desc(const struct vm_area_desc *desc, + unsigned long start, unsigned long end) +{ + if (!desc) + return false; + + return range_is_subset(desc->start, desc->end, start, end); } #ifdef CONFIG_MMU @@ -4411,6 +4499,9 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); +int map_kernel_pages_prepare(struct vm_area_desc *desc); +int map_kernel_pages_complete(struct vm_area_struct *vma, + struct mmap_action *action); int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num); int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b702c63bf0e0..a308e2c23b82 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -815,6 +815,7 @@ enum mmap_action_type { MMAP_REMAP_PFN, /* Remap PFN range. */ MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ MMAP_SIMPLE_IO_REMAP, /* I/O remap with guardrails. */ + MMAP_MAP_KERNEL_PAGES, /* Map kernel page range from array. 
*/ }; /* @@ -833,6 +834,12 @@ struct mmap_action { phys_addr_t start_phys_addr; unsigned long size; } simple_ioremap; + struct { + unsigned long start; + struct page **pages; + unsigned long nr_pages; + pgoff_t pgoff; + } map_kernel; }; enum mmap_action_type type; diff --git a/mm/memory.c b/mm/memory.c index c1c323512939..5d032b5293c6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2484,13 +2484,14 @@ out: int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num) { - const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1; + const unsigned long nr_pages = *num; + const unsigned long end = addr + PAGE_SIZE * nr_pages; - if (addr < vma->vm_start || end_addr >= vma->vm_end) + if (!range_in_vma(vma, addr, end)) return -EFAULT; if (!(vma->vm_flags & VM_MIXEDMAP)) { - BUG_ON(mmap_read_trylock(vma->vm_mm)); - BUG_ON(vma->vm_flags & VM_PFNMAP); + VM_WARN_ON_ONCE(mmap_read_trylock(vma->vm_mm)); + VM_WARN_ON_ONCE(vma->vm_flags & VM_PFNMAP); vm_flags_set(vma, VM_MIXEDMAP); } /* Defer page refcount checking till we're about to map that page. 
*/ @@ -2498,6 +2499,39 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_pages); +int map_kernel_pages_prepare(struct vm_area_desc *desc) +{ + const struct mmap_action *action = &desc->action; + const unsigned long addr = action->map_kernel.start; + unsigned long nr_pages, end; + + if (!vma_desc_test(desc, VMA_MIXEDMAP_BIT)) { + VM_WARN_ON_ONCE(mmap_read_trylock(desc->mm)); + VM_WARN_ON_ONCE(vma_desc_test(desc, VMA_PFNMAP_BIT)); + vma_desc_set_flags(desc, VMA_MIXEDMAP_BIT); + } + + nr_pages = action->map_kernel.nr_pages; + end = addr + PAGE_SIZE * nr_pages; + if (!range_in_vma_desc(desc, addr, end)) + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL(map_kernel_pages_prepare); + +int map_kernel_pages_complete(struct vm_area_struct *vma, + struct mmap_action *action) +{ + unsigned long nr_pages; + + nr_pages = action->map_kernel.nr_pages; + return insert_pages(vma, action->map_kernel.start, + action->map_kernel.pages, + &nr_pages, vma->vm_page_prot); +} +EXPORT_SYMBOL(map_kernel_pages_complete); + /** * vm_insert_page - insert single page into user vma * @vma: user vma to map to diff --git a/mm/util.c b/mm/util.c index 5ae20876ef2c..f063fd4de1e8 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1448,6 +1448,8 @@ int mmap_action_prepare(struct vm_area_desc *desc) return io_remap_pfn_range_prepare(desc); case MMAP_SIMPLE_IO_REMAP: return simple_ioremap_prepare(desc); + case MMAP_MAP_KERNEL_PAGES: + return map_kernel_pages_prepare(desc); } WARN_ON_ONCE(1); @@ -1475,6 +1477,9 @@ int mmap_action_complete(struct vm_area_struct *vma, case MMAP_REMAP_PFN: err = remap_pfn_range_complete(vma, action); break; + case MMAP_MAP_KERNEL_PAGES: + err = map_kernel_pages_complete(vma, action); + break; case MMAP_IO_REMAP_PFN: case MMAP_SIMPLE_IO_REMAP: /* Should have been delegated. 
*/ @@ -1495,6 +1500,7 @@ int mmap_action_prepare(struct vm_area_desc *desc) case MMAP_REMAP_PFN: case MMAP_IO_REMAP_PFN: case MMAP_SIMPLE_IO_REMAP: + case MMAP_MAP_KERNEL_PAGES: WARN_ON_ONCE(1); /* nommu cannot handle these. */ break; } @@ -1514,6 +1520,7 @@ int mmap_action_complete(struct vm_area_struct *vma, case MMAP_REMAP_PFN: case MMAP_IO_REMAP_PFN: case MMAP_SIMPLE_IO_REMAP: + case MMAP_MAP_KERNEL_PAGES: WARN_ON_ONCE(1); /* nommu cannot handle this. */ err = -EINVAL; diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index ecd47d0f7d17..b4864aad2db0 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -454,6 +454,7 @@ enum mmap_action_type { MMAP_REMAP_PFN, /* Remap PFN range. */ MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ MMAP_SIMPLE_IO_REMAP, /* I/O remap with guardrails. */ + MMAP_MAP_KERNEL_PAGES, /* Map kernel page range from an array. */ }; /* @@ -472,6 +473,12 @@ struct mmap_action { phys_addr_t start_phys_addr; unsigned long size; } simple_ioremap; + struct { + unsigned long start; + struct page **pages; + unsigned long nr_pages; + pgoff_t pgoff; + } map_kernel; }; enum mmap_action_type type; -- cgit v1.2.3 From c0ea52c18c78c33c68c350eb9d3dcdf8c513254d Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 18:07:18 +0000 Subject: mm/huge_memory: simplify vma_is_specal_huge() Patch series "mm/huge_memory: refactor zap_huge_pmd()", v3. zap_huge_pmd() is overly complicated, clean it up and also add an assert in the case that we encounter a buggy PMD entry that doesn't match expectations. This is motivated by a bug discovered [0] where the PMD entry was none of: * A non-DAX, PFN or mixed map. * The huge zero folio * A present PMD entry * A softleaf entry In zap_huge_pmd(), but due to the bug we managed to reach this code.
It is useful to explicitly call this out rather than have an arbitrary NULL pointer dereference happen, which also improves understanding of what's going on. The series goes further to make use of vm_normal_folio_pmd() rather than implementing custom logic for retrieving the folio, and extends softleaf functionality to provide and use an equivalent softleaf function. This patch (of 13): This function is confused - it overloads the term 'special' yet again, checks for DAX but in many cases the code explicitly excludes DAX before invoking the predicate. It also unnecessarily checks for vma->vm_file - this has to be present for a driver to have set VMA_MIXEDMAP_BIT or VMA_PFNMAP_BIT. In fact, a far simpler form of this is to reverse the DAX predicate and return false if DAX is set. This makes sense from the point of view of 'special' as in vm_normal_page(), as DAX actually does potentially have retrievable folios. Also there's no need to have this in mm.h so move it to huge_memory.c. No functional change intended. 
Link: https://lkml.kernel.org/r/cover.1774029655.git.ljs@kernel.org Link: https://lkml.kernel.org/r/d2b65883dc4895f197c4b4a69fbf27a063463412.1774029655.git.ljs@kernel.org Link: https://lore.kernel.org/all/6b3d7ad7-49e1-407a-903d-3103704160d8@lucifer.local/ [0] Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Zi Yan Cc: Qi Zheng Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++-- include/linux/mm.h | 16 ---------------- mm/huge_memory.c | 30 +++++++++++++++++++++++------- 3 files changed, 25 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index bd7f0e1d8094..61fda1672b29 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -83,7 +83,7 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; * file is never split and the MAX_PAGECACHE_ORDER limit does not apply to * it. Same to PFNMAPs where there's neither page* nor pagecache. */ -#define THP_ORDERS_ALL_SPECIAL \ +#define THP_ORDERS_ALL_SPECIAL_DAX \ (BIT(PMD_ORDER) | BIT(PUD_ORDER)) #define THP_ORDERS_ALL_FILE_DEFAULT \ ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0)) @@ -92,7 +92,7 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; * Mask of all large folio orders supported for THP. */ #define THP_ORDERS_ALL \ - (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL | THP_ORDERS_ALL_FILE_DEFAULT) + (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL_DAX | THP_ORDERS_ALL_FILE_DEFAULT) enum tva_type { TVA_SMAPS, /* Exposing "THPeligible:" in smaps. 
*/ diff --git a/include/linux/mm.h b/include/linux/mm.h index 61dff7f03554..8260e28205e9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -5068,22 +5068,6 @@ long copy_folio_from_user(struct folio *dst_folio, const void __user *usr_src, bool allow_pagefault); -/** - * vma_is_special_huge - Are transhuge page-table entries considered special? - * @vma: Pointer to the struct vm_area_struct to consider - * - * Whether transhuge page-table entries are considered "special" following - * the definition in vm_normal_page(). - * - * Return: true if transhuge page-table entries should be considered special, - * false otherwise. - */ -static inline bool vma_is_special_huge(const struct vm_area_struct *vma) -{ - return vma_is_dax(vma) || (vma->vm_file && - (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); -} - #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if MAX_NUMNODES > 1 diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1c1a7cf7b209..db390b0098d9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -100,6 +100,14 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } +/* If returns true, we are unable to access the VMA's folios. */ +static bool vma_is_special_huge(const struct vm_area_struct *vma) +{ + if (vma_is_dax(vma)) + return false; + return vma_test_any(vma, VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT); +} + unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, enum tva_type type, @@ -113,8 +121,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, /* Check the intersection of requested and supported orders. 
*/ if (vma_is_anonymous(vma)) supported_orders = THP_ORDERS_ALL_ANON; - else if (vma_is_special_huge(vma)) - supported_orders = THP_ORDERS_ALL_SPECIAL; + else if (vma_is_dax(vma) || vma_is_special_huge(vma)) + supported_orders = THP_ORDERS_ALL_SPECIAL_DAX; else supported_orders = THP_ORDERS_ALL_FILE_DEFAULT; @@ -2415,7 +2423,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb->fullmm); arch_check_zapped_pmd(vma, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); - if (!vma_is_dax(vma) && vma_is_special_huge(vma)) { + if (vma_is_special_huge(vma)) { if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); @@ -2917,7 +2925,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm); arch_check_zapped_pud(vma, orig_pud); tlb_remove_pud_tlb_entry(tlb, pud, addr); - if (!vma_is_dax(vma) && vma_is_special_huge(vma)) { + if (vma_is_special_huge(vma)) { spin_unlock(ptl); /* No zero page support yet */ } else { @@ -3068,7 +3076,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, */ if (arch_needs_pgtable_deposit()) zap_deposited_table(mm, pmd); - if (!vma_is_dax(vma) && vma_is_special_huge(vma)) + if (vma_is_special_huge(vma)) return; if (unlikely(pmd_is_migration_entry(old_pmd))) { const softleaf_t old_entry = softleaf_from_pmd(old_pmd); @@ -4629,8 +4637,16 @@ next: static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) { - return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) || - is_vm_hugetlb_page(vma); + if (vma_is_dax(vma)) + return true; + if (vma_is_special_huge(vma)) + return true; + if (vma_test(vma, VMA_IO_BIT)) + return true; + if (is_vm_hugetlb_page(vma)) + return true; + + return false; } static int split_huge_pages_pid(int pid, unsigned long vaddr_start, -- cgit v1.2.3 From b92b9d4f699ce1f0ae746ebc69bca329adc07293 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes 
(Oracle)" Date: Fri, 20 Mar 2026 18:07:20 +0000 Subject: mm/huge_memory: have zap_huge_pmd return a boolean, add kdoc There's no need to use the ancient approach of returning an integer here, just return a boolean. Also update flush_needed to be a boolean, similarly. Also add a kdoc comment describing the function. No functional change intended. Link: https://lkml.kernel.org/r/132274566cd49d2960a2294c36dd2450593dfc55.1774029655.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Baolin Wang Acked-by: Qi Zheng Reviewed-by: Suren Baghdasaryan Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++-- mm/huge_memory.c | 23 ++++++++++++++++------- 2 files changed, 18 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 61fda1672b29..2949e5acff35 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -27,8 +27,8 @@ static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf); bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long next); -int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr); +bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr); int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr); bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4dfffd6a1bbe..65e554afdf16 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2402,11 +2402,20 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) mm_dec_nr_ptes(mm); } 
-int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, +/** + * zap_huge_pmd - Zap a huge THP which is of PMD size. + * @tlb: The MMU gather TLB state associated with the operation. + * @vma: The VMA containing the range to zap. + * @pmd: A pointer to the leaf PMD entry. + * @addr: The virtual address for the range to zap. + * + * Returns: %true on success, %false otherwise. + */ +bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { struct folio *folio = NULL; - int flush_needed = 1; + bool flush_needed = true; spinlock_t *ptl; pmd_t orig_pmd; @@ -2414,7 +2423,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, ptl = __pmd_trans_huge_lock(pmd, vma); if (!ptl) - return 0; + return false; /* * For architectures like ppc64 we look at deposited pgtable * when calling pmdp_huge_get_and_clear. So do the @@ -2429,13 +2438,13 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); - return 1; + return true; } if (is_huge_zero_pmd(orig_pmd)) { if (!vma_is_dax(vma) || arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); - return 1; + return true; } if (pmd_present(orig_pmd)) { @@ -2449,7 +2458,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, const softleaf_t entry = softleaf_from_pmd(orig_pmd); folio = softleaf_to_folio(entry); - flush_needed = 0; + flush_needed = false; if (!thp_migration_supported()) WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); @@ -2483,7 +2492,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (flush_needed) tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); - return 1; + return true; } #ifndef pmd_move_must_withdraw -- cgit v1.2.3 From 64b7d889d03ce94940d6dd9440c4e74c1108ac78 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 
18:07:28 +0000 Subject: mm: add softleaf_is_valid_pmd_entry(), pmd_to_softleaf_folio() Separate pmd_is_valid_softleaf() into separate components, then use the new softleaf_is_valid_pmd_entry() predicate to implement pmd_to_softleaf_folio(). This returns the folio associated with a softleaf entry at PMD level. It expects this to be valid for a PMD entry. If CONFIG_DEBUG_VM is set, then warn if the entry is invalid, and either way return NULL in this case. This lays the ground for further refactorings. Link: https://lkml.kernel.org/r/b677592596274fa3fd701890497948e4b0e07cec.1774029655.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Qi Zheng Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/leafops.h | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/leafops.h b/include/linux/leafops.h index 05673d3529e7..992cd8bd8ed0 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -607,7 +607,20 @@ static inline bool pmd_is_migration_entry(pmd_t pmd) } /** - * pmd_is_valid_softleaf() - Is this PMD entry a valid leaf entry? + * softleaf_is_valid_pmd_entry() - Is the specified softleaf entry obtained from + * a PMD one that we support at PMD level? + * @entry: Entry to check. + * Returns: true if the softleaf entry is valid at PMD, otherwise false. + */ +static inline bool softleaf_is_valid_pmd_entry(softleaf_t entry) +{ + /* Only device private, migration entries valid for PMD. */ + return softleaf_is_device_private(entry) || + softleaf_is_migration(entry); +} + +/** + * pmd_is_valid_softleaf() - Is this PMD entry a valid softleaf entry? * @pmd: PMD entry.
* * PMD leaf entries are valid only if they are device private or migration @@ -620,9 +633,27 @@ static inline bool pmd_is_valid_softleaf(pmd_t pmd) { const softleaf_t entry = softleaf_from_pmd(pmd); - /* Only device private, migration entries valid for PMD. */ - return softleaf_is_device_private(entry) || - softleaf_is_migration(entry); + return softleaf_is_valid_pmd_entry(entry); +} + +/** + * pmd_to_softleaf_folio() - Convert the PMD entry to a folio. + * @pmd: PMD entry. + * + * The PMD entry is expected to be a valid PMD softleaf entry. + * + * Returns: the folio the softleaf entry references if this is a valid softleaf + * entry, otherwise NULL. + */ +static inline struct folio *pmd_to_softleaf_folio(pmd_t pmd) +{ + const softleaf_t entry = softleaf_from_pmd(pmd); + + if (!softleaf_is_valid_pmd_entry(entry)) { + VM_WARN_ON_ONCE(true); + return NULL; + } + return softleaf_to_folio(entry); } #endif /* CONFIG_MMU */ -- cgit v1.2.3