From 8c48eea3adf3119e0a3fc57bd31f6966f26ee784 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 12 Apr 2023 21:26:04 -0700
Subject: page_pool: allow caching from safely localized NAPI

Recent patches to mlx5 mentioned a regression when moving from
driver local page pool to only using the generic page pool code.
Page pool has two recycling paths: (1) the direct one, which runs in
safe NAPI context (basically consumer context, so producing can be
lockless); and (2) via a ptr_ring, which takes a spin lock because
the freeing can happen from any CPU; producer and consumer may run
concurrently.

Since the page pool code was added, Eric introduced a revised version
of deferred skb freeing. TCP skbs are now usually returned to the CPU
which allocated them, and freed in softirq context. This places the
freeing (producing of pages back to the pool) enticingly close to
the allocation (consumer).

If we can prove that we're freeing in the same softirq context in which
the consumer NAPI will run - lockless use of the cache is perfectly
fine, no need for the lock.

Let drivers link the page pool to a NAPI instance. If the NAPI instance
is scheduled on the same CPU on which we're freeing - place the pages
in the direct cache.

With that and patched bnxt (XDP enabled to engage the page pool, sigh,
bnxt really needs page pool work :() I see a 2.6% perf boost with
a TCP stream test (app on a different physical core than softirq).

The CPU use of relevant functions decreases as expected:

  page_pool_refill_alloc_cache   1.17% -> 0%
  _raw_spin_lock                 2.41% -> 0.98%

Only consider the lockless path to be safe when NAPI is scheduled
- in practice this should cover the majority, if not all, of steady
state workloads. It's usually the NAPI kicking in that causes the skb
flush.

The main case we'll miss out on is when the application runs on the
same CPU as NAPI. In that case we don't use the deferred skb free path.

Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Dragos Tatulea <dtatulea@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/core/page_pool.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'net/core/page_pool.c')

diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 193c18799865..2f6bf422ed30 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -19,6 +19,7 @@
 #include <linux/mm.h> /* for put_page() */
 #include <linux/poison.h>
 #include <linux/ethtool.h>
+#include <linux/netdevice.h>
 
 #include <trace/events/page_pool.h>
 
@@ -874,9 +875,11 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
 }
 EXPORT_SYMBOL(page_pool_update_nid);
 
-bool page_pool_return_skb_page(struct page *page)
+bool page_pool_return_skb_page(struct page *page, bool napi_safe)
 {
+	struct napi_struct *napi;
 	struct page_pool *pp;
+	bool allow_direct;
 
 	page = compound_head(page);
 
@@ -892,12 +895,20 @@ bool page_pool_return_skb_page(struct page *page)
 
 	pp = page->pp;
 
+	/* Allow direct recycle if we have reasons to believe that we are
+	 * in the same context as the consumer would run, so there's
+	 * no possible race.
+	 */
+	napi = pp->p.napi;
+	allow_direct = napi_safe && napi &&
+		READ_ONCE(napi->list_owner) == smp_processor_id();
+
 	/* Driver set this to memory recycling info. Reset it on recycle.
 	 * This will *not* work for NIC using a split-page memory model.
 	 * The page will be returned to the pool here regardless of the
 	 * 'flipped' fragment being in use or not.
 	 */
-	page_pool_put_full_page(pp, page, false);
+	page_pool_put_full_page(pp, page, allow_direct);
 
 	return true;
 }
--
cgit v1.2.3
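The driver-facing side of this commit lives in include/net/page_pool.h
(not visible above, since the listing is limited to net/core/page_pool.c):
struct page_pool_params gains a `napi` member that a driver sets to the
sole NAPI instance consuming from the pool. A minimal sketch of how a
driver might opt in - the `my_rx_ring` type, its members, and the function
name are hypothetical, not taken from any real driver:

#include <linux/err.h>
#include <linux/netdevice.h>
#include <net/page_pool.h>

struct my_rx_ring {                     /* hypothetical driver state */
	struct device *dev;
	struct napi_struct napi;
	struct page_pool *page_pool;
	unsigned int ring_size;
};

static int my_rx_ring_create_pool(struct my_rx_ring *ring)
{
	struct page_pool_params pp = {
		.flags		= PP_FLAG_DMA_MAP,
		.order		= 0,
		.pool_size	= ring->ring_size,
		.nid		= NUMA_NO_NODE,
		.dev		= ring->dev,
		.dma_dir	= DMA_FROM_DEVICE,
		/* Linking the queue's NAPI opts in to the lockless
		 * direct-recycling path added by this commit.
		 */
		.napi		= &ring->napi,
	};

	ring->page_pool = page_pool_create(&pp);
	return PTR_ERR_OR_ZERO(ring->page_pool);
}

Setting .napi is only safe when that NAPI instance is truly the pool's
sole consumer; a pool shared between queues must leave it NULL and fall
back to the ptr_ring path.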
From 8e4c62c7d980eaf0f64c1c0ef0c80f5685af0fb6 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 17 Apr 2023 08:28:05 -0700
Subject: page_pool: add DMA_ATTR_WEAK_ORDERING on all mappings

Commit c519fe9a4f0d ("bnxt: add dma mapping attributes") added
DMA_ATTR_WEAK_ORDERING to DMA attrs on bnxt. It has since spread
to a few more drivers (possibly as a copy'n'paste).

DMA_ATTR_WEAK_ORDERING only seems to matter on Sparc and PowerPC/cell;
the rarity of these platforms is likely why we never bothered adding
the attribute in the page pool, even though it should be safe to add.

To make the page pool migration in drivers which set this flag
less of a risk (of regressing the precious sparc database workloads
or whatever needed this) let's add DMA_ATTR_WEAK_ORDERING on all
page pool DMA mappings.

We could make this a driver opt-in but frankly I don't think it's
worth complicating the API. I can't think of a reason why device
accesses to packet memory would have to be ordered.

Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Acked-by: Somnath Kotur <somnath.kotur@broadcom.com>
Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Link: https://lore.kernel.org/r/20230417152805.331865-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/core/page_pool.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net/core/page_pool.c')

diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 2f6bf422ed30..97f20f7ff4fc 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -316,7 +316,8 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
 	 */
 	dma = dma_map_page_attrs(pool->p.dev, page, 0,
 				 (PAGE_SIZE << pool->p.order),
-				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
+				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC |
+						  DMA_ATTR_WEAK_ORDERING);
 	if (dma_mapping_error(pool->p.dev, dma))
 		return false;
 
@@ -484,7 +485,7 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
 	/* When page is unmapped, it cannot be returned to our pool */
 	dma_unmap_page_attrs(pool->p.dev, dma,
 			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
-			     DMA_ATTR_SKIP_CPU_SYNC);
+			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
 	page_pool_set_dma_addr(page, 0);
 skip_dma_unmap:
 	page_pool_clear_pp_info(page);
--
cgit v1.2.3
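Note the patch touches both the map and the unmap call sites: attributes
passed at unmap time are generally expected to match the ones used for
the mapping, so the new flag has to ride along in both places. A
standalone sketch of the paired calls - `dev`, `page` and `order` are
assumed to be supplied by the caller, and both attributes are hints the
DMA core is free to ignore on platforms where they have no effect:

#include <linux/dma-mapping.h>

/* Caller is expected to check the result with dma_mapping_error() */
static dma_addr_t rx_map_page(struct device *dev, struct page *page,
			      unsigned int order)
{
	/* Skip the CPU sync (the page starts out device-owned) and
	 * allow weakly ordered device access where supported.
	 */
	return dma_map_page_attrs(dev, page, 0, PAGE_SIZE << order,
				  DMA_FROM_DEVICE,
				  DMA_ATTR_SKIP_CPU_SYNC |
				  DMA_ATTR_WEAK_ORDERING);
}

static void rx_unmap_page(struct device *dev, dma_addr_t dma,
			  unsigned int order)
{
	/* Pass the same attrs that were used for the mapping */
	dma_unmap_page_attrs(dev, dma, PAGE_SIZE << order,
			     DMA_FROM_DEVICE,
			     DMA_ATTR_SKIP_CPU_SYNC |
			     DMA_ATTR_WEAK_ORDERING);
}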
From dd64b232deb8d48812a2ea739d1fedaeaffb59ed Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 19 Apr 2023 11:20:06 -0700
Subject: page_pool: unlink from napi during destroy

Jesper points out that we must prevent recycling into cache
after page_pool_destroy() is called, because page_pool_destroy()
is not synchronized with recycling (some pages may still be
outstanding when destroy() gets called).

I assumed this will not happen because NAPI can't be scheduled
if its page pool is being destroyed. But I missed the fact that
NAPI may get reused. For instance when user changes ring configuration
driver may allocate a new page pool, stop NAPI, swap, start NAPI,
and then destroy the old pool. The NAPI is running so old page
pool will think it can recycle to the cache, but the consumer
at that point is the destroy() path, not NAPI.

To avoid extra synchronization let the drivers do "unlinking"
during the "swap" stage while NAPI is indeed disabled.

Fixes: 8c48eea3adf3 ("page_pool: allow caching from safely localized NAPI")
Reported-by: Jesper Dangaard Brouer <brouer@redhat.com>
Link: https://lore.kernel.org/all/e8df2654-6a5b-3c92-489d-2fe5e444135f@redhat.com/
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Link: https://lore.kernel.org/r/20230419182006.719923-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/page_pool.h |  5 +++++
 net/core/page_pool.c    | 18 +++++++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'net/core/page_pool.c')

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 91b808dade82..c8ec2f34722b 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -247,6 +247,7 @@ struct page_pool *page_pool_create(const struct page_pool_params *params);
 struct xdp_mem_info;
 
 #ifdef CONFIG_PAGE_POOL
+void page_pool_unlink_napi(struct page_pool *pool);
 void page_pool_destroy(struct page_pool *pool);
 void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
 			   struct xdp_mem_info *mem);
@@ -254,6 +255,10 @@ void page_pool_release_page(struct page_pool *pool, struct page *page);
 void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 			     int count);
 #else
+static inline void page_pool_unlink_napi(struct page_pool *pool)
+{
+}
+
 static inline void page_pool_destroy(struct page_pool *pool)
 {
 }
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 97f20f7ff4fc..e212e9d7edcb 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -839,6 +839,21 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
 	pool->xdp_mem_id = mem->id;
 }
 
+void page_pool_unlink_napi(struct page_pool *pool)
+{
+	if (!pool->p.napi)
+		return;
+
+	/* To avoid races with recycling and additional barriers make sure
+	 * pool and NAPI are unlinked when NAPI is disabled.
+	 */
+	WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state) ||
+		READ_ONCE(pool->p.napi->list_owner) != -1);
+
+	WRITE_ONCE(pool->p.napi, NULL);
+}
+EXPORT_SYMBOL(page_pool_unlink_napi);
+
 void page_pool_destroy(struct page_pool *pool)
 {
 	if (!pool)
@@ -847,6 +862,7 @@ void page_pool_destroy(struct page_pool *pool)
 	if (!page_pool_put(pool))
 		return;
 
+	page_pool_unlink_napi(pool);
 	page_pool_free_frag(pool);
 
 	if (!page_pool_release(pool))
@@ -900,7 +916,7 @@ bool page_pool_return_skb_page(struct page *page, bool napi_safe)
 	 * in the same context as the consumer would run, so there's
 	 * no possible race.
 	 */
-	napi = pp->p.napi;
+	napi = READ_ONCE(pp->p.napi);
 	allow_direct = napi_safe && napi &&
 		READ_ONCE(napi->list_owner) == smp_processor_id();
 
--
cgit v1.2.3
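Concretely, the intended driver-side sequence during a ring
reconfiguration looks roughly like the sketch below (the `my_rx_ring`
type is again hypothetical and error handling is elided). The unlink
happens in the window where NAPI is disabled, so a late recycle of an
outstanding page can no longer pick the lockless cache while
page_pool_destroy() is the real consumer:

static void my_ring_swap_pool(struct my_rx_ring *ring,
			      struct page_pool *new_pool)
{
	struct page_pool *old_pool = ring->page_pool;

	napi_disable(&ring->napi);

	/* NAPI is off: break the pool<->NAPI link so that any pages
	 * still in flight recycle via the locked ptr_ring path rather
	 * than the per-CPU cache.
	 */
	page_pool_unlink_napi(old_pool);
	ring->page_pool = new_pool;

	napi_enable(&ring->napi);

	/* Safe even with pages outstanding; destroy() also unlinks as
	 * a backstop per this patch, warning if NAPI looks active.
	 */
	page_pool_destroy(old_pool);
}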
From 368d3cb406cdd074d1df2ad9ec06d1bfcb664882 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Mon, 22 May 2023 11:17:14 +0800
Subject: page_pool: fix inconsistency for page_pool_ring_[un]lock()

page_pool_ring_[un]lock() use in_softirq() to decide which spin
lock variant to use. When they are called in a context where
in_softirq() is false, spin_lock_bh() is called in
page_pool_ring_lock() but spin_unlock() is called in
page_pool_ring_unlock(), because spin_lock_bh() has already
disabled the softirq by the time the unlock-side check runs,
which makes the spin lock pair inconsistent.

This patch fixes it by returning the in_softirq state from
page_pool_producer_lock(), and using it to decide which spin
lock variant to use in page_pool_producer_unlock().

As pool->ring has both producer and consumer locks, rename the
helpers to page_pool_producer_[un]lock() to reflect the actual
usage. Also move them to page_pool.c as they are only used there,
and remove the 'inline', as the compiler may have a better idea
of whether to inline or not.

Fixes: 7886244736a4 ("net: page_pool: Add bulk support for ptr_ring")
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Link: https://lore.kernel.org/r/20230522031714.5089-1-linyunsheng@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/page_pool.h | 18 ------------------
 net/core/page_pool.c    | 28 ++++++++++++++++++++++++++--
 2 files changed, 26 insertions(+), 20 deletions(-)

(limited to 'net/core/page_pool.c')

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index c8ec2f34722b..126f9e294389 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -399,22 +399,4 @@ static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid)
 		page_pool_update_nid(pool, new_nid);
 }
 
-static inline void page_pool_ring_lock(struct page_pool *pool)
-	__acquires(&pool->ring.producer_lock)
-{
-	if (in_softirq())
-		spin_lock(&pool->ring.producer_lock);
-	else
-		spin_lock_bh(&pool->ring.producer_lock);
-}
-
-static inline void page_pool_ring_unlock(struct page_pool *pool)
-	__releases(&pool->ring.producer_lock)
-{
-	if (in_softirq())
-		spin_unlock(&pool->ring.producer_lock);
-	else
-		spin_unlock_bh(&pool->ring.producer_lock);
-}
-
 #endif /* _NET_PAGE_POOL_H */
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index e212e9d7edcb..a3e12a61d456 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -134,6 +134,29 @@ EXPORT_SYMBOL(page_pool_ethtool_stats_get);
 #define recycle_stat_add(pool, __stat, val)
 #endif
 
+static bool page_pool_producer_lock(struct page_pool *pool)
+	__acquires(&pool->ring.producer_lock)
+{
+	bool in_softirq = in_softirq();
+
+	if (in_softirq)
+		spin_lock(&pool->ring.producer_lock);
+	else
+		spin_lock_bh(&pool->ring.producer_lock);
+
+	return in_softirq;
+}
+
+static void page_pool_producer_unlock(struct page_pool *pool,
+				      bool in_softirq)
+	__releases(&pool->ring.producer_lock)
+{
+	if (in_softirq)
+		spin_unlock(&pool->ring.producer_lock);
+	else
+		spin_unlock_bh(&pool->ring.producer_lock);
+}
+
 static int page_pool_init(struct page_pool *pool,
 			  const struct page_pool_params *params)
 {
@@ -617,6 +640,7 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 			     int count)
 {
 	int i, bulk_len = 0;
+	bool in_softirq;
 
 	for (i = 0; i < count; i++) {
 		struct page *page = virt_to_head_page(data[i]);
@@ -635,7 +659,7 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 		return;
 
 	/* Bulk producer into ptr_ring page_pool cache */
-	page_pool_ring_lock(pool);
+	in_softirq = page_pool_producer_lock(pool);
 	for (i = 0; i < bulk_len; i++) {
 		if (__ptr_ring_produce(&pool->ring, data[i])) {
 			/* ring full */
@@ -644,7 +668,7 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 		}
 	}
 	recycle_stat_add(pool, ring, i);
-	page_pool_ring_unlock(pool);
+	page_pool_producer_unlock(pool, in_softirq);
 
 	/* Hopefully all pages was return into ptr_ring */
 	if (likely(i == bulk_len))
--
cgit v1.2.3
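The subtlety worth spelling out: in_softirq() cannot simply be checked
again at unlock time, because spin_lock_bh() itself disables bottom
halves, so the unlock-side check would return true and pick the wrong
variant (spin_unlock() instead of spin_unlock_bh(), leaving BHs disabled
forever). The general shape of the fix, reduced to a self-contained
sketch around a hypothetical lock (not part of the page pool API):

#include <linux/interrupt.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_lock);

static bool my_lock_acquire(void)
{
	/* Sample the context once, before the lock can change it */
	bool in_softirq = in_softirq();

	if (in_softirq)
		spin_lock(&my_lock);	/* BHs already disabled */
	else
		spin_lock_bh(&my_lock);

	return in_softirq;
}

static void my_lock_release(bool in_softirq)
{
	/* Use the saved state, not a fresh in_softirq() check */
	if (in_softirq)
		spin_unlock(&my_lock);
	else
		spin_unlock_bh(&my_lock);
}

Threading the saved state through the caller, as the patch does with the
bool returned from page_pool_producer_lock(), keeps the lock/unlock pair
consistent in every calling context.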