diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/filemap.c | 157 | ||||
-rw-r--r-- | mm/internal.h | 6 | ||||
-rw-r--r-- | mm/memcontrol.c | 18 | ||||
-rw-r--r-- | mm/slab.c | 34 | ||||
-rw-r--r-- | mm/slub.c | 13 | ||||
-rw-r--r-- | mm/vmalloc.c | 1 |
6 files changed, 191 insertions, 38 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 0e20a8d6dd93..b794943bce76 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -42,6 +42,8 @@ #include <linux/ramfs.h> #include <linux/page_idle.h> #include <linux/migrate.h> +#include <linux/pipe_fs_i.h> +#include <linux/splice.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include "internal.h" @@ -2440,21 +2442,19 @@ static int filemap_read_folio(struct file *file, filler_t filler, } static bool filemap_range_uptodate(struct address_space *mapping, - loff_t pos, struct iov_iter *iter, struct folio *folio) + loff_t pos, size_t count, struct folio *folio, + bool need_uptodate) { - int count; - if (folio_test_uptodate(folio)) return true; /* pipes can't handle partially uptodate pages */ - if (iov_iter_is_pipe(iter)) + if (need_uptodate) return false; if (!mapping->a_ops->is_partially_uptodate) return false; if (mapping->host->i_blkbits >= folio_shift(folio)) return false; - count = iter->count; if (folio_pos(folio) > pos) { count -= folio_pos(folio) - pos; pos = 0; @@ -2466,8 +2466,8 @@ static bool filemap_range_uptodate(struct address_space *mapping, } static int filemap_update_page(struct kiocb *iocb, - struct address_space *mapping, struct iov_iter *iter, - struct folio *folio) + struct address_space *mapping, size_t count, + struct folio *folio, bool need_uptodate) { int error; @@ -2501,7 +2501,8 @@ static int filemap_update_page(struct kiocb *iocb, goto unlock; error = 0; - if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, folio)) + if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio, + need_uptodate)) goto unlock; error = -EAGAIN; @@ -2577,8 +2578,8 @@ static int filemap_readahead(struct kiocb *iocb, struct file *file, return 0; } -static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter, - struct folio_batch *fbatch) +static int filemap_get_pages(struct kiocb *iocb, size_t count, + struct folio_batch *fbatch, bool need_uptodate) { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; @@ -2589,7 +2590,7 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter, int err = 0; /* "last_index" is the index of the page beyond the end of the read */ - last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE); + last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE); retry: if (fatal_signal_pending(current)) return -EINTR; @@ -2622,7 +2623,8 @@ retry: if ((iocb->ki_flags & IOCB_WAITQ) && folio_batch_count(fbatch) > 1) iocb->ki_flags |= IOCB_NOWAIT; - err = filemap_update_page(iocb, mapping, iter, folio); + err = filemap_update_page(iocb, mapping, count, folio, + need_uptodate); if (err) goto err; } @@ -2692,7 +2694,8 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(iocb->ki_pos >= i_size_read(inode))) break; - error = filemap_get_pages(iocb, iter, &fbatch); + error = filemap_get_pages(iocb, iter->count, &fbatch, + iov_iter_is_pipe(iter)); if (error < 0) break; @@ -2842,6 +2845,134 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) } EXPORT_SYMBOL(generic_file_read_iter); +/* + * Splice subpages from a folio into a pipe. + */ +size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, + struct folio *folio, loff_t fpos, size_t size) +{ + struct page *page; + size_t spliced = 0, offset = offset_in_folio(folio, fpos); + + page = folio_page(folio, offset / PAGE_SIZE); + size = min(size, folio_size(folio) - offset); + offset %= PAGE_SIZE; + + while (spliced < size && + !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + struct pipe_buffer *buf = pipe_head_buf(pipe); + size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced); + + *buf = (struct pipe_buffer) { + .ops = &page_cache_pipe_buf_ops, + .page = page, + .offset = offset, + .len = part, + }; + folio_get(folio); + pipe->head++; + page++; + spliced += part; + offset = 0; + } + + return spliced; +} + +/* + * Splice folios from the pagecache of a buffered (ie. non-O_DIRECT) file into + * a pipe. + */ +ssize_t filemap_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct folio_batch fbatch; + struct kiocb iocb; + size_t total_spliced = 0, used, npages; + loff_t isize, end_offset; + bool writably_mapped; + int i, error = 0; + + init_sync_kiocb(&iocb, in); + iocb.ki_pos = *ppos; + + /* Work out how much data we can actually add into the pipe */ + used = pipe_occupancy(pipe->head, pipe->tail); + npages = max_t(ssize_t, pipe->max_usage - used, 0); + len = min_t(size_t, len, npages * PAGE_SIZE); + + folio_batch_init(&fbatch); + + do { + cond_resched(); + + if (*ppos >= i_size_read(file_inode(in))) + break; + + iocb.ki_pos = *ppos; + error = filemap_get_pages(&iocb, len, &fbatch, true); + if (error < 0) + break; + + /* + * i_size must be checked after we know the pages are Uptodate. + * + * Checking i_size after the check allows us to calculate + * the correct value for "nr", which means the zero-filled + * part of the page is not copied back to userspace (unless + * another truncate extends the file - this is desired though). + */ + isize = i_size_read(file_inode(in)); + if (unlikely(*ppos >= isize)) + break; + end_offset = min_t(loff_t, isize, *ppos + len); + + /* + * Once we start copying data, we don't want to be touching any + * cachelines that might be contended: + */ + writably_mapped = mapping_writably_mapped(in->f_mapping); + + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + size_t n; + + if (folio_pos(folio) >= end_offset) + goto out; + folio_mark_accessed(folio); + + /* + * If users can be writing to this folio using arbitrary + * virtual addresses, take care of potential aliasing + * before reading the folio on the kernel side. + */ + if (writably_mapped) + flush_dcache_folio(folio); + + n = min_t(loff_t, len, isize - *ppos); + n = splice_folio_into_pipe(pipe, folio, *ppos, n); + if (!n) + goto out; + len -= n; + total_spliced += n; + *ppos += n; + in->f_ra.prev_pos = *ppos; + if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + goto out; + } + + folio_batch_release(&fbatch); + } while (len); + +out: + folio_batch_release(&fbatch); + file_accessed(in); + + return total_spliced ? total_spliced : error; +} +EXPORT_SYMBOL(filemap_splice_read); + static inline loff_t folio_seek_hole_data(struct xa_state *xas, struct address_space *mapping, struct folio *folio, loff_t start, loff_t end, bool seek_data) diff --git a/mm/internal.h b/mm/internal.h index bcf75a8b032d..6d4ca98f3844 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -795,6 +795,12 @@ struct migration_target_control { }; /* + * mm/filemap.c + */ +size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, + struct folio *folio, loff_t fpos, size_t size); + +/* * mm/vmalloc.c */ #ifdef CONFIG_MMU diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 73afff8062f9..49f40730e711 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -88,6 +88,9 @@ static bool cgroup_memory_nosocket __ro_after_init; /* Kernel memory accounting disabled? */ static bool cgroup_memory_nokmem __ro_after_init; +/* BPF memory accounting disabled? */ +static bool cgroup_memory_nobpf __ro_after_init; + #ifdef CONFIG_CGROUP_WRITEBACK static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); #endif @@ -347,6 +350,9 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg, */ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); + +DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key); +EXPORT_SYMBOL(memcg_bpf_enabled_key); #endif /** @@ -5357,6 +5363,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_inc(&memcg_sockets_enabled_key); +#if defined(CONFIG_MEMCG_KMEM) + if (!cgroup_memory_nobpf) + static_branch_inc(&memcg_bpf_enabled_key); +#endif + return &memcg->css; } @@ -5441,6 +5452,11 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) static_branch_dec(&memcg_sockets_enabled_key); +#if defined(CONFIG_MEMCG_KMEM) + if (!cgroup_memory_nobpf) + static_branch_dec(&memcg_bpf_enabled_key); +#endif + vmpressure_cleanup(&memcg->vmpressure); cancel_work_sync(&memcg->high_work); mem_cgroup_remove_from_trees(memcg); @@ -7269,6 +7285,8 @@ static int __init cgroup_memory(char *s) cgroup_memory_nosocket = true; if (!strcmp(token, "nokmem")) cgroup_memory_nokmem = true; + if (!strcmp(token, "nobpf")) + cgroup_memory_nobpf = true; } return 1; } diff --git a/mm/slab.c b/mm/slab.c index 29300fc1289a..74ece29e3a7e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -220,7 +220,6 @@ static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, static inline void fixup_slab_list(struct kmem_cache *cachep, struct kmem_cache_node *n, struct slab *slab, void **list); -static int slab_early_init = 1; #define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) @@ -1249,8 +1248,6 @@ void __init kmem_cache_init(void) slab_state = PARTIAL_NODE; setup_kmalloc_cache_index_table(); - slab_early_init = 0; - /* 5) Replace the bootstrap kmem_cache_node */ { int nid; @@ -1389,7 +1386,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab) BUG_ON(!folio_test_slab(folio)); __slab_clear_pfmemalloc(slab); - page_mapcount_reset(folio_page(folio, 0)); + page_mapcount_reset(&folio->page); folio->mapping = NULL; /* Make the mapping reset visible before clearing the flag */ smp_wmb(); @@ -1398,7 +1395,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab) if (current->reclaim_state) current->reclaim_state->reclaimed_slab += 1 << order; unaccount_slab(slab, order, cachep); - __free_pages(folio_page(folio, 0), order); + __free_pages(&folio->page, order); } static void kmem_rcu_free(struct rcu_head *head) @@ -1413,13 +1410,10 @@ static void kmem_rcu_free(struct rcu_head *head) } #if DEBUG -static bool is_debug_pagealloc_cache(struct kmem_cache *cachep) +static inline bool is_debug_pagealloc_cache(struct kmem_cache *cachep) { - if (debug_pagealloc_enabled_static() && OFF_SLAB(cachep) && - (cachep->size % PAGE_SIZE) == 0) - return true; - - return false; + return debug_pagealloc_enabled_static() && OFF_SLAB(cachep) && + ((cachep->size % PAGE_SIZE) == 0); } #ifdef CONFIG_DEBUG_PAGEALLOC @@ -3479,14 +3473,15 @@ cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags, int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { - size_t i; struct obj_cgroup *objcg = NULL; + unsigned long irqflags; + size_t i; s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags); if (!s) return 0; - local_irq_disable(); + local_irq_save(irqflags); for (i = 0; i < size; i++) { void *objp = kfence_alloc(s, s->object_size, flags) ?: __do_cache_alloc(s, flags, NUMA_NO_NODE); @@ -3495,7 +3490,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, goto error; p[i] = objp; } - local_irq_enable(); + local_irq_restore(irqflags); cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_); @@ -3508,7 +3503,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, /* FIXME: Trace call missing. Christoph would like a bulk variant */ return size; error: - local_irq_enable(); + local_irq_restore(irqflags); cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_); slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size); kmem_cache_free_bulk(s, i, p); @@ -3610,8 +3605,9 @@ EXPORT_SYMBOL(kmem_cache_free); void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) { + unsigned long flags; - local_irq_disable(); + local_irq_save(flags); for (int i = 0; i < size; i++) { void *objp = p[i]; struct kmem_cache *s; @@ -3621,9 +3617,9 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) /* called via kfree_bulk */ if (!folio_test_slab(folio)) { - local_irq_enable(); + local_irq_restore(flags); free_large_kmalloc(folio, objp); - local_irq_disable(); + local_irq_save(flags); continue; } s = folio_slab(folio)->slab_cache; @@ -3640,7 +3636,7 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) __cache_free(s, objp, _RET_IP_); } - local_irq_enable(); + local_irq_restore(flags); /* FIXME: add tracing */ } diff --git a/mm/slub.c b/mm/slub.c index 13459c69095a..1013834fb7bb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2066,7 +2066,7 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab) if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; unaccount_slab(slab, order, s); - __free_pages(folio_page(folio, 0), order); + __free_pages(&folio->page, order); } static void rcu_free_slab(struct rcu_head *h) @@ -3913,6 +3913,7 @@ static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p, struct obj_cgroup *objcg) { struct kmem_cache_cpu *c; + unsigned long irqflags; int i; /* @@ -3921,7 +3922,7 @@ static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, * handlers invoking normal fastpath. */ c = slub_get_cpu_ptr(s->cpu_slab); - local_lock_irq(&s->cpu_slab->lock); + local_lock_irqsave(&s->cpu_slab->lock, irqflags); for (i = 0; i < size; i++) { void *object = kfence_alloc(s, s->object_size, flags); @@ -3942,7 +3943,7 @@ static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, */ c->tid = next_tid(c->tid); - local_unlock_irq(&s->cpu_slab->lock); + local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); /* * Invoking slow path likely have side-effect @@ -3956,7 +3957,7 @@ static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, c = this_cpu_ptr(s->cpu_slab); maybe_wipe_obj_freeptr(s, p[i]); - local_lock_irq(&s->cpu_slab->lock); + local_lock_irqsave(&s->cpu_slab->lock, irqflags); continue; /* goto for-loop */ } @@ -3965,7 +3966,7 @@ static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, maybe_wipe_obj_freeptr(s, p[i]); } c->tid = next_tid(c->tid); - local_unlock_irq(&s->cpu_slab->lock); + local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); slub_put_cpu_ptr(s->cpu_slab); return i; @@ -6449,7 +6450,7 @@ static void debugfs_slab_add(struct kmem_cache *s) void debugfs_slab_release(struct kmem_cache *s) { - debugfs_remove_recursive(debugfs_lookup(s->name, slab_debugfs_root)); + debugfs_lookup_and_remove(s->name, slab_debugfs_root); } static int __init slab_debugfs_init(void) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ca71de7c9d77..61f5bec0f2b6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -656,6 +656,7 @@ int is_vmalloc_or_module_addr(const void *x) #endif return is_vmalloc_addr(x); } +EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr); /* * Walk a vmap address to the struct page it maps. Huge vmap mappings will |