summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/filemap.c157
-rw-r--r--mm/internal.h6
-rw-r--r--mm/memcontrol.c18
-rw-r--r--mm/slab.c34
-rw-r--r--mm/slub.c13
-rw-r--r--mm/vmalloc.c1
6 files changed, 191 insertions, 38 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 0e20a8d6dd93..b794943bce76 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -42,6 +42,8 @@
#include <linux/ramfs.h>
#include <linux/page_idle.h>
#include <linux/migrate.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/splice.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -2440,21 +2442,19 @@ static int filemap_read_folio(struct file *file, filler_t filler,
}
static bool filemap_range_uptodate(struct address_space *mapping,
- loff_t pos, struct iov_iter *iter, struct folio *folio)
+ loff_t pos, size_t count, struct folio *folio,
+ bool need_uptodate)
{
- int count;
-
if (folio_test_uptodate(folio))
return true;
/* pipes can't handle partially uptodate pages */
- if (iov_iter_is_pipe(iter))
+ if (need_uptodate)
return false;
if (!mapping->a_ops->is_partially_uptodate)
return false;
if (mapping->host->i_blkbits >= folio_shift(folio))
return false;
- count = iter->count;
if (folio_pos(folio) > pos) {
count -= folio_pos(folio) - pos;
pos = 0;
@@ -2466,8 +2466,8 @@ static bool filemap_range_uptodate(struct address_space *mapping,
}
static int filemap_update_page(struct kiocb *iocb,
- struct address_space *mapping, struct iov_iter *iter,
- struct folio *folio)
+ struct address_space *mapping, size_t count,
+ struct folio *folio, bool need_uptodate)
{
int error;
@@ -2501,7 +2501,8 @@ static int filemap_update_page(struct kiocb *iocb,
goto unlock;
error = 0;
- if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, folio))
+ if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio,
+ need_uptodate))
goto unlock;
error = -EAGAIN;
@@ -2577,8 +2578,8 @@ static int filemap_readahead(struct kiocb *iocb, struct file *file,
return 0;
}
-static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
- struct folio_batch *fbatch)
+static int filemap_get_pages(struct kiocb *iocb, size_t count,
+ struct folio_batch *fbatch, bool need_uptodate)
{
struct file *filp = iocb->ki_filp;
struct address_space *mapping = filp->f_mapping;
@@ -2589,7 +2590,7 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
int err = 0;
/* "last_index" is the index of the page beyond the end of the read */
- last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);
+ last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
retry:
if (fatal_signal_pending(current))
return -EINTR;
@@ -2622,7 +2623,8 @@ retry:
if ((iocb->ki_flags & IOCB_WAITQ) &&
folio_batch_count(fbatch) > 1)
iocb->ki_flags |= IOCB_NOWAIT;
- err = filemap_update_page(iocb, mapping, iter, folio);
+ err = filemap_update_page(iocb, mapping, count, folio,
+ need_uptodate);
if (err)
goto err;
}
@@ -2692,7 +2694,8 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
if (unlikely(iocb->ki_pos >= i_size_read(inode)))
break;
- error = filemap_get_pages(iocb, iter, &fbatch);
+ error = filemap_get_pages(iocb, iter->count, &fbatch,
+ iov_iter_is_pipe(iter));
if (error < 0)
break;
@@ -2842,6 +2845,134 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
}
EXPORT_SYMBOL(generic_file_read_iter);
+/*
+ * Splice subpages from a folio into a pipe.
+ */
+size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
+ struct folio *folio, loff_t fpos, size_t size)
+{
+ struct page *page;
+ size_t spliced = 0, offset = offset_in_folio(folio, fpos);
+
+ page = folio_page(folio, offset / PAGE_SIZE);
+ size = min(size, folio_size(folio) - offset);
+ offset %= PAGE_SIZE;
+
+ while (spliced < size &&
+ !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ struct pipe_buffer *buf = pipe_head_buf(pipe);
+ size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);
+
+ *buf = (struct pipe_buffer) {
+ .ops = &page_cache_pipe_buf_ops,
+ .page = page,
+ .offset = offset,
+ .len = part,
+ };
+ folio_get(folio);
+ pipe->head++;
+ page++;
+ spliced += part;
+ offset = 0;
+ }
+
+ return spliced;
+}
+
+/*
+ * Splice folios from the pagecache of a buffered (ie. non-O_DIRECT) file into
+ * a pipe.
+ */
+ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ struct folio_batch fbatch;
+ struct kiocb iocb;
+ size_t total_spliced = 0, used, npages;
+ loff_t isize, end_offset;
+ bool writably_mapped;
+ int i, error = 0;
+
+ init_sync_kiocb(&iocb, in);
+ iocb.ki_pos = *ppos;
+
+ /* Work out how much data we can actually add into the pipe */
+ used = pipe_occupancy(pipe->head, pipe->tail);
+ npages = max_t(ssize_t, pipe->max_usage - used, 0);
+ len = min_t(size_t, len, npages * PAGE_SIZE);
+
+ folio_batch_init(&fbatch);
+
+ do {
+ cond_resched();
+
+ if (*ppos >= i_size_read(file_inode(in)))
+ break;
+
+ iocb.ki_pos = *ppos;
+ error = filemap_get_pages(&iocb, len, &fbatch, true);
+ if (error < 0)
+ break;
+
+ /*
+ * i_size must be checked after we know the pages are Uptodate.
+ *
+ * Checking i_size after the check allows us to calculate
+ * the correct value for "nr", which means the zero-filled
+ * part of the page is not copied back to userspace (unless
+ * another truncate extends the file - this is desired though).
+ */
+ isize = i_size_read(file_inode(in));
+ if (unlikely(*ppos >= isize))
+ break;
+ end_offset = min_t(loff_t, isize, *ppos + len);
+
+ /*
+ * Once we start copying data, we don't want to be touching any
+ * cachelines that might be contended:
+ */
+ writably_mapped = mapping_writably_mapped(in->f_mapping);
+
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+ size_t n;
+
+ if (folio_pos(folio) >= end_offset)
+ goto out;
+ folio_mark_accessed(folio);
+
+ /*
+ * If users can be writing to this folio using arbitrary
+ * virtual addresses, take care of potential aliasing
+ * before reading the folio on the kernel side.
+ */
+ if (writably_mapped)
+ flush_dcache_folio(folio);
+
+ n = min_t(loff_t, len, isize - *ppos);
+ n = splice_folio_into_pipe(pipe, folio, *ppos, n);
+ if (!n)
+ goto out;
+ len -= n;
+ total_spliced += n;
+ *ppos += n;
+ in->f_ra.prev_pos = *ppos;
+ if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ goto out;
+ }
+
+ folio_batch_release(&fbatch);
+ } while (len);
+
+out:
+ folio_batch_release(&fbatch);
+ file_accessed(in);
+
+ return total_spliced ? total_spliced : error;
+}
+EXPORT_SYMBOL(filemap_splice_read);
+
static inline loff_t folio_seek_hole_data(struct xa_state *xas,
struct address_space *mapping, struct folio *folio,
loff_t start, loff_t end, bool seek_data)
diff --git a/mm/internal.h b/mm/internal.h
index bcf75a8b032d..6d4ca98f3844 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -795,6 +795,12 @@ struct migration_target_control {
};
/*
+ * mm/filemap.c
+ */
+size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
+ struct folio *folio, loff_t fpos, size_t size);
+
+/*
* mm/vmalloc.c
*/
#ifdef CONFIG_MMU
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 73afff8062f9..49f40730e711 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -88,6 +88,9 @@ static bool cgroup_memory_nosocket __ro_after_init;
/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem __ro_after_init;
+/* BPF memory accounting disabled? */
+static bool cgroup_memory_nobpf __ro_after_init;
+
#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
@@ -347,6 +350,9 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
*/
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
+
+DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key);
+EXPORT_SYMBOL(memcg_bpf_enabled_key);
#endif
/**
@@ -5357,6 +5363,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_inc(&memcg_sockets_enabled_key);
+#if defined(CONFIG_MEMCG_KMEM)
+ if (!cgroup_memory_nobpf)
+ static_branch_inc(&memcg_bpf_enabled_key);
+#endif
+
return &memcg->css;
}
@@ -5441,6 +5452,11 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
static_branch_dec(&memcg_sockets_enabled_key);
+#if defined(CONFIG_MEMCG_KMEM)
+ if (!cgroup_memory_nobpf)
+ static_branch_dec(&memcg_bpf_enabled_key);
+#endif
+
vmpressure_cleanup(&memcg->vmpressure);
cancel_work_sync(&memcg->high_work);
mem_cgroup_remove_from_trees(memcg);
@@ -7269,6 +7285,8 @@ static int __init cgroup_memory(char *s)
cgroup_memory_nosocket = true;
if (!strcmp(token, "nokmem"))
cgroup_memory_nokmem = true;
+ if (!strcmp(token, "nobpf"))
+ cgroup_memory_nobpf = true;
}
return 1;
}
diff --git a/mm/slab.c b/mm/slab.c
index 29300fc1289a..74ece29e3a7e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -220,7 +220,6 @@ static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
static inline void fixup_slab_list(struct kmem_cache *cachep,
struct kmem_cache_node *n, struct slab *slab,
void **list);
-static int slab_early_init = 1;
#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
@@ -1249,8 +1248,6 @@ void __init kmem_cache_init(void)
slab_state = PARTIAL_NODE;
setup_kmalloc_cache_index_table();
- slab_early_init = 0;
-
/* 5) Replace the bootstrap kmem_cache_node */
{
int nid;
@@ -1389,7 +1386,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab)
BUG_ON(!folio_test_slab(folio));
__slab_clear_pfmemalloc(slab);
- page_mapcount_reset(folio_page(folio, 0));
+ page_mapcount_reset(&folio->page);
folio->mapping = NULL;
/* Make the mapping reset visible before clearing the flag */
smp_wmb();
@@ -1398,7 +1395,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab)
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += 1 << order;
unaccount_slab(slab, order, cachep);
- __free_pages(folio_page(folio, 0), order);
+ __free_pages(&folio->page, order);
}
static void kmem_rcu_free(struct rcu_head *head)
@@ -1413,13 +1410,10 @@ static void kmem_rcu_free(struct rcu_head *head)
}
#if DEBUG
-static bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
+static inline bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
{
- if (debug_pagealloc_enabled_static() && OFF_SLAB(cachep) &&
- (cachep->size % PAGE_SIZE) == 0)
- return true;
-
- return false;
+ return debug_pagealloc_enabled_static() && OFF_SLAB(cachep) &&
+ ((cachep->size % PAGE_SIZE) == 0);
}
#ifdef CONFIG_DEBUG_PAGEALLOC
@@ -3479,14 +3473,15 @@ cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags,
int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
void **p)
{
- size_t i;
struct obj_cgroup *objcg = NULL;
+ unsigned long irqflags;
+ size_t i;
s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags);
if (!s)
return 0;
- local_irq_disable();
+ local_irq_save(irqflags);
for (i = 0; i < size; i++) {
void *objp = kfence_alloc(s, s->object_size, flags) ?:
__do_cache_alloc(s, flags, NUMA_NO_NODE);
@@ -3495,7 +3490,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
goto error;
p[i] = objp;
}
- local_irq_enable();
+ local_irq_restore(irqflags);
cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
@@ -3508,7 +3503,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
/* FIXME: Trace call missing. Christoph would like a bulk variant */
return size;
error:
- local_irq_enable();
+ local_irq_restore(irqflags);
cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size);
kmem_cache_free_bulk(s, i, p);
@@ -3610,8 +3605,9 @@ EXPORT_SYMBOL(kmem_cache_free);
void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
{
+ unsigned long flags;
- local_irq_disable();
+ local_irq_save(flags);
for (int i = 0; i < size; i++) {
void *objp = p[i];
struct kmem_cache *s;
@@ -3621,9 +3617,9 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
/* called via kfree_bulk */
if (!folio_test_slab(folio)) {
- local_irq_enable();
+ local_irq_restore(flags);
free_large_kmalloc(folio, objp);
- local_irq_disable();
+ local_irq_save(flags);
continue;
}
s = folio_slab(folio)->slab_cache;
@@ -3640,7 +3636,7 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
__cache_free(s, objp, _RET_IP_);
}
- local_irq_enable();
+ local_irq_restore(flags);
/* FIXME: add tracing */
}
diff --git a/mm/slub.c b/mm/slub.c
index 13459c69095a..1013834fb7bb 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2066,7 +2066,7 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab)
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
unaccount_slab(slab, order, s);
- __free_pages(folio_page(folio, 0), order);
+ __free_pages(&folio->page, order);
}
static void rcu_free_slab(struct rcu_head *h)
@@ -3913,6 +3913,7 @@ static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
size_t size, void **p, struct obj_cgroup *objcg)
{
struct kmem_cache_cpu *c;
+ unsigned long irqflags;
int i;
/*
@@ -3921,7 +3922,7 @@ static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
* handlers invoking normal fastpath.
*/
c = slub_get_cpu_ptr(s->cpu_slab);
- local_lock_irq(&s->cpu_slab->lock);
+ local_lock_irqsave(&s->cpu_slab->lock, irqflags);
for (i = 0; i < size; i++) {
void *object = kfence_alloc(s, s->object_size, flags);
@@ -3942,7 +3943,7 @@ static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
*/
c->tid = next_tid(c->tid);
- local_unlock_irq(&s->cpu_slab->lock);
+ local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);
/*
* Invoking slow path likely have side-effect
@@ -3956,7 +3957,7 @@ static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
c = this_cpu_ptr(s->cpu_slab);
maybe_wipe_obj_freeptr(s, p[i]);
- local_lock_irq(&s->cpu_slab->lock);
+ local_lock_irqsave(&s->cpu_slab->lock, irqflags);
continue; /* goto for-loop */
}
@@ -3965,7 +3966,7 @@ static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
maybe_wipe_obj_freeptr(s, p[i]);
}
c->tid = next_tid(c->tid);
- local_unlock_irq(&s->cpu_slab->lock);
+ local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);
slub_put_cpu_ptr(s->cpu_slab);
return i;
@@ -6449,7 +6450,7 @@ static void debugfs_slab_add(struct kmem_cache *s)
void debugfs_slab_release(struct kmem_cache *s)
{
- debugfs_remove_recursive(debugfs_lookup(s->name, slab_debugfs_root));
+ debugfs_lookup_and_remove(s->name, slab_debugfs_root);
}
static int __init slab_debugfs_init(void)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ca71de7c9d77..61f5bec0f2b6 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -656,6 +656,7 @@ int is_vmalloc_or_module_addr(const void *x)
#endif
return is_vmalloc_addr(x);
}
+EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr);
/*
* Walk a vmap address to the struct page it maps. Huge vmap mappings will