From 74e2712b14e35a200ba24b0b99e56a2d91499ff0 Mon Sep 17 00:00:00 2001
From: Tamir Duberstein
Date: Tue, 12 Nov 2024 14:25:36 -0500
Subject: xarray: extract xa_zero_to_null

Patch series "xarray: extract __xa_cmpxchg_raw".

This series reduces duplication between __xa_cmpxchg and __xa_insert by
extracting a new function that does not coerce zero entries to null on
the return path. The new function may be used by the upcoming Rust
xarray abstraction in its reservation API, where it is useful to tell
the difference between zero entries and null slots.

This patch (of 2):

Reduce code duplication by extracting a static inline function that
returns its argument if it is non-zero and NULL otherwise.

This changes xas_result to check for errors before checking for zero,
but this cannot change the behavior of existing callers:

- __xa_erase: passes the result of xas_store(_, NULL), which cannot fail.
- __xa_store: passes the result of xas_store(_, entry), which may fail.
  xas_store calls xas_create when entry is not NULL, which returns NULL
  on error, which is immediately checked. This should not change
  observable behavior.
- __xa_cmpxchg: passes the result of xas_load(_), which might be zero.
  This would previously return NULL regardless of the outcome of
  xas_store, but xas_store cannot fail if xas_load returns zero because
  there is no need to allocate memory.
- xa_store_range: same as __xa_erase.

Link: https://lkml.kernel.org/r/20241112-xarray-insert-cmpxchg-v1-0-dc2bdd8c4136@gmail.com
Link: https://lkml.kernel.org/r/20241112-xarray-insert-cmpxchg-v1-1-dc2bdd8c4136@gmail.com
Signed-off-by: Tamir Duberstein
Cc: Alice Ryhl
Cc: Andreas Hindborg
Cc: Matthew Wilcox
Signed-off-by: Andrew Morton
---
 lib/xarray.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'lib/xarray.c')

diff --git a/lib/xarray.c b/lib/xarray.c
index 32d4bac8c94c..1b8305bffbff 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -435,6 +435,11 @@ static unsigned long max_index(void *entry)
 	return (XA_CHUNK_SIZE << xa_to_node(entry)->shift) - 1;
 }
 
+static inline void *xa_zero_to_null(void *entry)
+{
+	return xa_is_zero(entry) ? NULL : entry;
+}
+
 static void xas_shrink(struct xa_state *xas)
 {
 	struct xarray *xa = xas->xa;
@@ -451,8 +456,8 @@ static void xas_shrink(struct xa_state *xas)
 			break;
 		if (!xa_is_node(entry) && node->shift)
 			break;
-		if (xa_is_zero(entry) && xa_zero_busy(xa))
-			entry = NULL;
+		if (xa_zero_busy(xa))
+			entry = xa_zero_to_null(entry);
 		xas->xa_node = XAS_BOUNDS;
 
 		RCU_INIT_POINTER(xa->xa_head, entry);
@@ -1474,9 +1479,7 @@ void *xa_load(struct xarray *xa, unsigned long index)
 
 	rcu_read_lock();
 	do {
-		entry = xas_load(&xas);
-		if (xa_is_zero(entry))
-			entry = NULL;
+		entry = xa_zero_to_null(xas_load(&xas));
 	} while (xas_retry(&xas, entry));
 	rcu_read_unlock();
 
@@ -1486,11 +1489,9 @@ EXPORT_SYMBOL(xa_load);
 
 static void *xas_result(struct xa_state *xas, void *curr)
 {
-	if (xa_is_zero(curr))
-		return NULL;
 	if (xas_error(xas))
 		curr = xas->xa_node;
-	return curr;
+	return xa_zero_to_null(curr);
 }
 
 /**
--
cgit v1.2.3
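For readers unfamiliar with zero entries: they are how the XArray represents
reserved slots, and the coercion that xa_zero_to_null() centralizes is what
keeps them invisible to normal lookups. A minimal sketch of those semantics
using only the public API (kernel context assumed; the WARN_ONs merely assert
the expected results, and error handling is elided):

#include <linux/xarray.h>

static DEFINE_XARRAY(array);

static void zero_entry_demo(void)
{
	/* Reserving a slot stores XA_ZERO_ENTRY internally. */
	WARN_ON(xa_reserve(&array, 0, GFP_KERNEL) != 0);

	/* The lookup path coerces the zero entry to NULL... */
	WARN_ON(xa_load(&array, 0) != NULL);

	/* ...but the slot is not free: insertion sees it as busy. */
	WARN_ON(xa_insert(&array, 0, xa_mk_value(1), GFP_KERNEL) != -EBUSY);

	xa_release(&array, 0);	/* drop the reservation */
}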
From 79ada2ae66153e285b3717ee4d4a56c8e517c1fa Mon Sep 17 00:00:00 2001
From: Tamir Duberstein
Date: Tue, 12 Nov 2024 14:25:37 -0500
Subject: xarray: extract helper from __xa_{insert,cmpxchg}

Reduce code duplication by extracting a static inline function. This
function is identical to __xa_cmpxchg with the exception that it does
not coerce zero entries to null on the return path.

[tamird@gmail.com: fix __xa_erase()]
  Link: https://lkml.kernel.org/r/CAJ-ks9kN_qddZ3Ne5d=cADu5POC1rHd4rQcbVSD_spnZOrLLZg@mail.gmail.com
Link: https://lkml.kernel.org/r/20241112-xarray-insert-cmpxchg-v1-2-dc2bdd8c4136@gmail.com
Signed-off-by: Tamir Duberstein
Cc: Alice Ryhl
Cc: Andreas Hindborg
Cc: Matthew Wilcox
Signed-off-by: Andrew Morton
---
 lib/xarray.c | 39 +++++++++++++++++++--------------------
 1 file changed, 19 insertions(+), 20 deletions(-)

(limited to 'lib/xarray.c')

diff --git a/lib/xarray.c b/lib/xarray.c
index 1b8305bffbff..5da8d18899a1 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -1491,7 +1491,7 @@ static void *xas_result(struct xa_state *xas, void *curr)
 {
 	if (xas_error(xas))
 		curr = xas->xa_node;
-	return xa_zero_to_null(curr);
+	return curr;
 }
 
 /**
@@ -1509,7 +1509,7 @@ static void *xas_result(struct xa_state *xas, void *curr)
 void *__xa_erase(struct xarray *xa, unsigned long index)
 {
 	XA_STATE(xas, xa, index);
-	return xas_result(&xas, xas_store(&xas, NULL));
+	return xas_result(&xas, xa_zero_to_null(xas_store(&xas, NULL)));
 }
 EXPORT_SYMBOL(__xa_erase);
 
@@ -1568,7 +1568,7 @@ void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
 			xas_clear_mark(&xas, XA_FREE_MARK);
 	} while (__xas_nomem(&xas, gfp));
 
-	return xas_result(&xas, curr);
+	return xas_result(&xas, xa_zero_to_null(curr));
 }
 EXPORT_SYMBOL(__xa_store);
 
@@ -1601,6 +1601,9 @@ void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
 }
 EXPORT_SYMBOL(xa_store);
 
+static inline void *__xa_cmpxchg_raw(struct xarray *xa, unsigned long index,
+		void *old, void *entry, gfp_t gfp);
+
 /**
  * __xa_cmpxchg() - Store this entry in the XArray.
  * @xa: XArray.
@@ -1619,6 +1622,13 @@ EXPORT_SYMBOL(xa_store);
  */
 void *__xa_cmpxchg(struct xarray *xa, unsigned long index,
 			void *old, void *entry, gfp_t gfp)
+{
+	return xa_zero_to_null(__xa_cmpxchg_raw(xa, index, old, entry, gfp));
+}
+EXPORT_SYMBOL(__xa_cmpxchg);
+
+static inline void *__xa_cmpxchg_raw(struct xarray *xa, unsigned long index,
+		void *old, void *entry, gfp_t gfp)
 {
 	XA_STATE(xas, xa, index);
 	void *curr;
@@ -1637,7 +1647,6 @@ void *__xa_cmpxchg(struct xarray *xa, unsigned long index,
 
 	return xas_result(&xas, curr);
 }
-EXPORT_SYMBOL(__xa_cmpxchg);
 
 /**
  * __xa_insert() - Store this entry in the XArray if no entry is present.
@@ -1657,26 +1666,16 @@ EXPORT_SYMBOL(__xa_cmpxchg);
  */
 int __xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
 {
-	XA_STATE(xas, xa, index);
 	void *curr;
+	int errno;
 
-	if (WARN_ON_ONCE(xa_is_advanced(entry)))
-		return -EINVAL;
 	if (!entry)
 		entry = XA_ZERO_ENTRY;
-
-	do {
-		curr = xas_load(&xas);
-		if (!curr) {
-			xas_store(&xas, entry);
-			if (xa_track_free(xa))
-				xas_clear_mark(&xas, XA_FREE_MARK);
-		} else {
-			xas_set_err(&xas, -EBUSY);
-		}
-	} while (__xas_nomem(&xas, gfp));
-
-	return xas_error(&xas);
+	curr = __xa_cmpxchg_raw(xa, index, NULL, entry, gfp);
+	errno = xa_err(curr);
+	if (errno)
+		return errno;
+	return (curr != NULL) ? -EBUSY : 0;
 }
 EXPORT_SYMBOL(__xa_insert);
--
cgit v1.2.3
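To see why a non-coercing variant matters for the reservation use case named
in the cover letter, consider what the public xa_cmpxchg() reports when the
slot holds a zero entry. A sketch under the same assumptions as above (kernel
context, illustrative only), derived from the post-patch code paths shown in
this diff:

#include <linux/xarray.h>

static DEFINE_XARRAY(array);

static void cmpxchg_vs_reserved_demo(void)
{
	void *curr;

	xa_reserve(&array, 5, GFP_KERNEL);	/* slot holds XA_ZERO_ENTRY */

	/*
	 * The exchange does not happen: the slot holds a zero entry, not
	 * NULL. Yet the coerced return value is NULL, indistinguishable
	 * from a successful exchange on an empty slot. __xa_cmpxchg_raw()
	 * returns XA_ZERO_ENTRY here, which is what a reservation API
	 * needs to observe.
	 */
	curr = xa_cmpxchg(&array, 5, NULL, xa_mk_value(42), GFP_KERNEL);
	WARN_ON(curr != NULL);
	WARN_ON(xa_load(&array, 5) != NULL);	/* still reserved, not 42 */

	xa_release(&array, 5);
}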
From 7e060df04f562b37bdc101fd06b16f013e2d989b Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Fri, 13 Dec 2024 20:25:19 +0800
Subject: Xarray: do not return sibling entries from xas_find_marked()

Patch series "Fixes and cleanups to xarray", v5.

This series contains some random fixes and cleanups to xarray. Patches
1-2 are fixes and the rest are cleanups. More details can be found in
the respective patches.

This patch (of 5):

Similar to the issue fixed in commit cbc02854331ed ("XArray: Do not
return sibling entries from xa_load()"), xas_find_marked() may return
sibling entries, as follows:

Thread A:                       Thread B:
                                xa_store_range(xa, entry, 6, 7, gfp);
                                xa_set_mark(xa, 6, mark)
XA_STATE(xas, xa, 6);
xas_find_marked(&xas, 7, mark);
offset = xas_find_chunk(xas, advance, mark);
[offset is 6, which points to a valid entry]
                                xa_store_range(xa, entry, 4, 7, gfp);
entry = xa_entry(xa, node, 6);
[entry is a sibling of 4]
if (!xa_is_node(entry))
        return entry;

Skip sibling entries in xas_find_marked(), as xas_find() already does,
to protect callers from seeing them; otherwise a caller may use a
sibling entry as a valid entry and crash the kernel.

Besides, the load_race() test is modified to catch the mentioned issue,
and the modified load_race() only passes after this fix is merged.

Here is an example of how this bug could be triggered in tmpfs with
large folios enabled in the mapping. Let's look at the racers involved:

1. How pages could be created and dirtied in a shmem file:

write
  ksys_write
    vfs_write
      new_sync_write
        shmem_file_write_iter
          generic_perform_write
            shmem_write_begin
              shmem_get_folio
                shmem_allowable_huge_orders
                shmem_alloc_and_add_folios
                  shmem_alloc_folio
                  __folio_set_locked
                  shmem_add_to_page_cache
                    XA_STATE_ORDER(..., index, order)
                    xas_store()
            shmem_write_end
              folio_mark_dirty()

2. How dirty pages could be deleted in a shmem file:

ioctl
  do_vfs_ioctl
    file_ioctl
      ioctl_preallocate
        vfs_fallocate
          shmem_fallocate
            shmem_truncate_range
              shmem_undo_range
                truncate_inode_folio
                  filemap_remove_folio
                    page_cache_delete
                      xas_store(&xas, NULL);

3. How dirty pages could be searched locklessly:

sync_file_range
  ksys_sync_file_range
    __filemap_fdatawrite_range
      filemap_fdatawrite_wbc
        do_writepages
          writeback_use_writepage
            writeback_iter
              writeback_get_folio
                filemap_get_folios_tag
                  find_get_entry
                    folio = xas_find_marked()
                    folio_try_get(folio)

The kernel will crash as follows:

1.Create                2.Search                        3.Delete

/* write page 2,3 */
write
...
shmem_write_begin
  XA_STATE_ORDER(xas, i_pages, index = 2, order = 1)
  xa_store(&xas, folio)
shmem_write_end
  folio_mark_dirty()

                        /* sync page 2 and page 3 */
                        sync_file_range
                        ...
                        find_get_entry
                          folio = xas_find_marked()
                          /* offset will be 2 */
                          offset = xas_find_chunk()

                                                        /* delete page 2 and page 3 */
                                                        ioctl
                                                        ...
                                                        xas_store(&xas, NULL);

/* write page 0-3 */
write
...
shmem_write_begin
  XA_STATE_ORDER(xas, i_pages, index = 0, order = 2)
  xa_store(&xas, folio)
shmem_write_end
  folio_mark_dirty(folio)

                        /* get sibling entry from offset 2 */
                        entry = xa_entry(.., 2)
                        /* use sibling entry as folio and crash kernel */
                        folio_try_get(folio)

Link: https://lkml.kernel.org/r/20241213122523.12764-1-shikemeng@huaweicloud.com
Link: https://lkml.kernel.org/r/20241213122523.12764-2-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi
Cc: Matthew Wilcox [English fixes]
Signed-off-by: Andrew Morton
---
 lib/xarray.c                          | 2 ++
 tools/testing/radix-tree/multiorder.c | 4 ++++
 2 files changed, 6 insertions(+)

(limited to 'lib/xarray.c')

diff --git a/lib/xarray.c b/lib/xarray.c
index 5da8d18899a1..19a9ea183c2d 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -1387,6 +1387,8 @@ void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark)
 		entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
 		if (!entry && !(xa_track_free(xas->xa) && mark == XA_FREE_MARK))
 			continue;
+		if (xa_is_sibling(entry))
+			continue;
 		if (!xa_is_node(entry))
 			return entry;
 		xas->xa_node = xa_to_node(entry);
diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c
index cffaf2245d4f..eaff1b036989 100644
--- a/tools/testing/radix-tree/multiorder.c
+++ b/tools/testing/radix-tree/multiorder.c
@@ -227,6 +227,7 @@ static void *load_creator(void *ptr)
 				unsigned long index = (3 << RADIX_TREE_MAP_SHIFT) -
 							(1 << order);
 				item_insert_order(tree, index, order);
+				xa_set_mark(tree, index, XA_MARK_1);
 				item_delete_rcu(tree, index);
 			}
 		}
@@ -242,8 +243,11 @@ static void *load_worker(void *ptr)
 	rcu_register_thread();
 
 	while (!stop_iteration) {
+		unsigned long find_index = (2 << RADIX_TREE_MAP_SHIFT) + 1;
 		struct item *item = xa_load(ptr, index);
 		assert(!xa_is_internal(item));
+		item = xa_find(ptr, &find_index, index, XA_MARK_1);
+		assert(!xa_is_internal(item));
 	}
 
 	rcu_unregister_thread();
--
cgit v1.2.3
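For context on sibling entries: a multi-index entry canonically occupies one
slot, and the other slots it covers hold internal sibling pointers back to it.
The invariant this fix restores for marked searches is that those internals
never escape to callers. A sketch of that invariant (kernel context with
CONFIG_XARRAY_MULTI assumed; illustrative only):

#include <linux/xarray.h>

static DEFINE_XARRAY(array);

static void sibling_demo(void)
{
	unsigned long index;

	/* One entry covers indices 16-31; slots 17-31 hold siblings. */
	xa_store_range(&array, 16, 31, xa_mk_value(16), GFP_KERNEL);
	xa_set_mark(&array, 16, XA_MARK_0);

	/* Plain loads must yield the real entry, never a sibling
	 * (the xa_load() half was fixed by the commit cited above). */
	for (index = 16; index <= 31; index++)
		WARN_ON(xa_load(&array, index) != xa_mk_value(16));

	/* The same must hold for marked searches, which is the
	 * xas_find_marked() half fixed here. */
	index = 0;
	WARN_ON(xa_find(&array, &index, ULONG_MAX, XA_MARK_0) !=
		xa_mk_value(16));
}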
From c9ba5249ef8b080c6779d3da14a061f6d4d6d5fa Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Fri, 13 Dec 2024 20:25:20 +0800
Subject: Xarray: move forward index correctly in xas_pause()

After xas_load(), xas->index could point into the middle of the found
multi-index entry, so the bits of xas->index below node->shift may be
non-zero. The subsequent xas_pause() advances xas->index in units of
1 << node->shift while leaving the bits below node->shift unmasked, and
thus skips some indices unexpectedly.

Consider the following case (assume XA_CHUNK_SHIFT is 4):

xa_store_range(xa, 16, 31, ...)
xa_store(xa, 32, ...)

XA_STATE(xas, xa, 17);
xas_for_each(&xas, ...)
        xas_load(&xas)
        /* xas->index = 17, xas->xa_offset = 1, xas->xa_node->shift = 4 */
        xas_pause()
        /* xas->index = 33, xas->xa_offset = 2, xas->xa_node->shift = 4 */

As we can see, index 32 is skipped unexpectedly. Fix this by masking
the bits below node->shift when moving the index forward in
xas_pause().

For now, this does not cause serious problems; only minor issues, such
as cachestat returning fewer page statuses than expected, could happen.

Link: https://lkml.kernel.org/r/20241213122523.12764-3-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi
Cc: Matthew Wilcox
Signed-off-by: Andrew Morton
---
 lib/test_xarray.c | 35 +++++++++++++++++++++++++++++++++++
 lib/xarray.c      |  1 +
 2 files changed, 36 insertions(+)

(limited to 'lib/xarray.c')

diff --git a/lib/test_xarray.c b/lib/test_xarray.c
index b6cac747ec46..eab5971d0a48 100644
--- a/lib/test_xarray.c
+++ b/lib/test_xarray.c
@@ -1511,6 +1511,41 @@ static noinline void check_pause(struct kunit *test)
 	XA_BUG_ON(xa, count != order_limit);
 
 	xa_destroy(xa);
+
+	index = 0;
+	for (order = XA_CHUNK_SHIFT; order > 0; order--) {
+		XA_BUG_ON(xa, xa_store_order(xa, index, order,
+					xa_mk_index(index), GFP_KERNEL));
+		index += 1UL << order;
+	}
+
+	index = 0;
+	count = 0;
+	xas_set(&xas, 0);
+	rcu_read_lock();
+	xas_for_each(&xas, entry, ULONG_MAX) {
+		XA_BUG_ON(xa, entry != xa_mk_index(index));
+		index += 1UL << (XA_CHUNK_SHIFT - count);
+		count++;
+	}
+	rcu_read_unlock();
+	XA_BUG_ON(xa, count != XA_CHUNK_SHIFT);
+
+	index = 0;
+	count = 0;
+	xas_set(&xas, XA_CHUNK_SIZE / 2 + 1);
+	rcu_read_lock();
+	xas_for_each(&xas, entry, ULONG_MAX) {
+		XA_BUG_ON(xa, entry != xa_mk_index(index));
+		index += 1UL << (XA_CHUNK_SHIFT - count);
+		count++;
+		xas_pause(&xas);
+	}
+	rcu_read_unlock();
+	XA_BUG_ON(xa, count != XA_CHUNK_SHIFT);
+
+	xa_destroy(xa);
+
 }
 
 static noinline void check_move_tiny(struct kunit *test)
diff --git a/lib/xarray.c b/lib/xarray.c
index 19a9ea183c2d..091e2c927915 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -1152,6 +1152,7 @@ void xas_pause(struct xa_state *xas)
 			if (!xa_is_sibling(xa_entry(xas->xa, node, offset)))
 				break;
 		}
+		xas->xa_index &= ~0UL << node->shift;
 		xas->xa_index += (offset - xas->xa_offset) << node->shift;
 		if (xas->xa_index == 0)
 			xas->xa_node = XAS_BOUNDS;
--
cgit v1.2.3
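The index arithmetic of this fix can be checked in isolation. A standalone
sketch using the commit's example values (XA_CHUNK_SHIFT == 4, iterator paused
at index 17 inside the entry covering 16-31); the variable names mirror the
xa_state fields but the code is otherwise independent of the kernel:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned long xa_index = 17;	/* mid-entry: low bits are set */
	unsigned int shift = 4;		/* node->shift */
	unsigned int xa_offset = 1;	/* slot of the current entry */
	unsigned int offset = 2;	/* next non-sibling slot */

	/* Old computation: 17 + (2 - 1) * 16 = 33; index 32 is skipped. */
	unsigned long old = xa_index +
			((unsigned long)(offset - xa_offset) << shift);
	assert(old == 33);

	/* Fixed computation: clear the bits below shift, then advance. */
	unsigned long fixed = xa_index & (~0UL << shift);	/* 16 */
	fixed += (unsigned long)(offset - xa_offset) << shift;	/* 32 */
	assert(fixed == 32);

	printf("old resume index: %lu, fixed resume index: %lu\n", old, fixed);
	return 0;
}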
From 97db889b961ef3f849813de34bd3ea5715813ed0 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Fri, 13 Dec 2024 20:25:21 +0800
Subject: Xarray: distinguish large entries correctly in xas_split_alloc()

We don't support splitting large entries that would expand two more
levels of xa_node. In the case "xas->xa_shift + 2 * XA_CHUNK_SHIFT ==
order", two levels of xa_node are also needed to expand, so distinguish
the entry as a large entry in that case as well.

As the maximum order of a folio in the page cache (MAX_PAGECACHE_ORDER)
is <= (XA_CHUNK_SHIFT * 2 - 1), this change is more of a cleanup than a
fix...

Link: https://lkml.kernel.org/r/20241213122523.12764-4-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi
Cc: Matthew Wilcox
Signed-off-by: Andrew Morton
---
 lib/xarray.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/xarray.c')

diff --git a/lib/xarray.c b/lib/xarray.c
index 091e2c927915..ecd2e4f71aa8 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -1027,7 +1027,7 @@ void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order,
 	unsigned int mask = xas->xa_sibs;
 
 	/* XXX: no support for splitting really large entries yet */
-	if (WARN_ON(xas->xa_shift + 2 * XA_CHUNK_SHIFT < order))
+	if (WARN_ON(xas->xa_shift + 2 * XA_CHUNK_SHIFT <= order))
 		goto nomem;
 	if (xas->xa_shift + XA_CHUNK_SHIFT > order)
 		return;
--
cgit v1.2.3
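The adjusted boundary can be sanity-checked with plain arithmetic: each level
of xa_node covers XA_CHUNK_SHIFT index bits, and xas_split_alloc() can add at
most one level below the entry being split. A standalone sketch of the accept
condition (XA_CHUNK_SHIFT of 6 assumed, the usual non-test value; this models
the check rather than reusing kernel code):

#include <assert.h>

#define XA_CHUNK_SHIFT 6

/* Post-fix acceptance check of xas_split_alloc(). */
static int split_alloc_accepts(unsigned int xa_shift, unsigned int order)
{
	return !(xa_shift + 2 * XA_CHUNK_SHIFT <= order);
}

int main(void)
{
	/* Needs one added level of nodes: supported. */
	assert(split_alloc_accepts(0, XA_CHUNK_SHIFT + 1));
	/* Largest supported order, matching MAX_PAGECACHE_ORDER's bound. */
	assert(split_alloc_accepts(0, 2 * XA_CHUNK_SHIFT - 1));
	/* Needs two added levels: the old "<" check wrongly let this
	 * through; the "<=" check rejects it. */
	assert(!split_alloc_accepts(0, 2 * XA_CHUNK_SHIFT));
	return 0;
}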
From 1988b318b32d3a0260e05bf6643990230df35843 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Fri, 13 Dec 2024 20:25:22 +0800
Subject: Xarray: remove repeat check in xas_squash_marks()

Callers of xas_squash_marks() have already ensured that xas->xa_sibs is
non-zero, so remove the repeated check of xas->xa_sibs in
xas_squash_marks().

Link: https://lkml.kernel.org/r/20241213122523.12764-5-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi
Cc: Matthew Wilcox
Signed-off-by: Andrew Morton
---
 lib/xarray.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'lib/xarray.c')

diff --git a/lib/xarray.c b/lib/xarray.c
index ecd2e4f71aa8..2386423865a0 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -128,9 +128,6 @@ static void xas_squash_marks(const struct xa_state *xas)
 	unsigned int mark = 0;
 	unsigned int limit = xas->xa_offset + xas->xa_sibs + 1;
 
-	if (!xas->xa_sibs)
-		return;
-
 	do {
 		unsigned long *marks = xas->xa_node->marks[mark];
 		if (find_next_bit(marks, limit, xas->xa_offset + 1) == limit)
--
cgit v1.2.3


From 13fd5cf37456620b57106dbfa89c5aad5f195343 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Fri, 13 Dec 2024 20:25:23 +0800
Subject: Xarray: use xa_mark_t in xas_squash_marks() to keep code consistent

All functions besides xas_squash_marks() use the xa_mark_t type to
iterate over all possible marks. Use xa_mark_t in xas_squash_marks() as
well to keep the code consistent.

Link: https://lkml.kernel.org/r/20241213122523.12764-6-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi
Cc: Matthew Wilcox
Signed-off-by: Andrew Morton
---
 lib/xarray.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'lib/xarray.c')

diff --git a/lib/xarray.c b/lib/xarray.c
index 2386423865a0..116e9286c64e 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -125,16 +125,20 @@ static inline void node_mark_all(struct xa_node *node, xa_mark_t mark)
  */
 static void xas_squash_marks(const struct xa_state *xas)
 {
-	unsigned int mark = 0;
+	xa_mark_t mark = 0;
 	unsigned int limit = xas->xa_offset + xas->xa_sibs + 1;
 
-	do {
-		unsigned long *marks = xas->xa_node->marks[mark];
-		if (find_next_bit(marks, limit, xas->xa_offset + 1) == limit)
-			continue;
-		__set_bit(xas->xa_offset, marks);
-		bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs);
-	} while (mark++ != (__force unsigned)XA_MARK_MAX);
+	for (;;) {
+		unsigned long *marks = node_marks(xas->xa_node, mark);
+
+		if (find_next_bit(marks, limit, xas->xa_offset + 1) != limit) {
+			__set_bit(xas->xa_offset, marks);
+			bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs);
+		}
+		if (mark == XA_MARK_MAX)
+			break;
+		mark_inc(mark);
+	}
 }
 
 /* extracts the offset within this node from the index */
--
cgit v1.2.3
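The loop's job is unchanged by the type cleanup: when sibling slots are
squashed into one canonical slot, each mark that was set anywhere in the
squashed range must end up set on the canonical slot. A standalone toy model
of that postcondition (plain bytes instead of the kernel's bitmaps, three
marks as in XA_MARK_0..XA_MARK_MAX; illustrative only):

#include <assert.h>

#define NR_MARKS 3
#define CHUNK 64

int main(void)
{
	unsigned char marks[NR_MARKS][CHUNK] = { { 0 } };
	unsigned int offset = 4, sibs = 3;	/* squash slots 4..7 */
	unsigned int m, i;

	marks[1][6] = 1;	/* mark 1 set on a sibling slot */

	for (m = 0; m < NR_MARKS; m++) {
		int any = 0;

		/* Model of the find_next_bit() + bitmap_clear() pair. */
		for (i = offset + 1; i <= offset + sibs; i++) {
			any |= marks[m][i];
			marks[m][i] = 0;
		}
		if (any)
			marks[m][offset] = 1;	/* models __set_bit() */
	}

	assert(marks[1][offset] == 1);			/* mark preserved */
	assert(marks[0][offset] == 0 && marks[2][offset] == 0);
	return 0;
}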