From 20b5c30398639b458371c228abfda829854b61c5 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 14 Jan 2016 15:18:08 -0800 Subject: Revert "gfp: add __GFP_NOACCOUNT" This reverts commit 8f4fc071b192 ("gfp: add __GFP_NOACCOUNT"). Black-list kmem accounting policy (aka __GFP_NOACCOUNT) turned out to be fragile and difficult to maintain, because there seem to be many more allocations that should not be accounted than those that should be. Besides, false accounting an allocation might result in much worse consequences than not accounting at all, namely increased memory consumption due to pinned dead kmem caches. So it was decided to switch to the white-list policy. This patch reverts bits introducing the black-list policy. The white-list policy will be introduced later in the series. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Greg Thelen Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index cd0e2413c358..2103f36b3bd3 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -773,8 +773,6 @@ static inline bool __memcg_kmem_bypass(gfp_t gfp) { if (!memcg_kmem_enabled()) return true; - if (gfp & __GFP_NOACCOUNT) - return true; if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) return true; return false; -- cgit v1.2.3 From a9bb7e620efdfd29b6d1c238041173e411670996 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 14 Jan 2016 15:18:12 -0800 Subject: memcg: only account kmem allocations marked as __GFP_ACCOUNT Black-list kmem accounting policy (aka __GFP_NOACCOUNT) turned out to be fragile and difficult to maintain, because there seem to be many more allocations that should not be accounted than those that should be. Besides, false accounting an allocation might result in much worse consequences than not accounting at all, namely increased memory consumption due to pinned dead kmem caches. So this patch switches kmem accounting to the white-policy: now only those kmem allocations that are marked as __GFP_ACCOUNT are accounted to memcg. Currently, no kmem allocations are marked like this. The following patches will mark several kmem allocations that are known to be easily triggered from userspace and therefore should be accounted to memcg. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Greg Thelen Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 9 +++++++++ include/linux/memcontrol.h | 2 ++ mm/page_alloc.c | 3 ++- 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 075b014448f5..1dd59abe541d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -30,6 +30,7 @@ struct vm_area_struct; #define ___GFP_HARDWALL 0x20000u #define ___GFP_THISNODE 0x40000u #define ___GFP_ATOMIC 0x80000u +#define ___GFP_ACCOUNT 0x100000u #define ___GFP_NOTRACK 0x200000u #define ___GFP_DIRECT_RECLAIM 0x400000u #define ___GFP_OTHER_NODE 0x800000u @@ -72,11 +73,15 @@ struct vm_area_struct; * * __GFP_THISNODE forces the allocation to be satisified from the requested * node with no fallbacks or placement policy enforcements. + * + * __GFP_ACCOUNT causes the allocation to be accounted to kmemcg (only relevant + * to kmem allocations). */ #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) #define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) #define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE) +#define __GFP_ACCOUNT ((__force gfp_t)___GFP_ACCOUNT) /* * Watermark modifiers -- controls access to emergency reserves @@ -195,6 +200,9 @@ struct vm_area_struct; * GFP_KERNEL is typical for kernel-internal allocations. The caller requires * ZONE_NORMAL or a lower zone for direct access but can direct reclaim. * + * GFP_KERNEL_ACCOUNT is the same as GFP_KERNEL, except the allocation is + * accounted to kmemcg. + * * GFP_NOWAIT is for kernel allocations that should not stall for direct * reclaim, start physical IO or use any filesystem callback. * @@ -234,6 +242,7 @@ struct vm_area_struct; */ #define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) +#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) #define GFP_NOIO (__GFP_RECLAIM) #define GFP_NOFS (__GFP_RECLAIM | __GFP_IO) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2103f36b3bd3..c9d9a8e7b45f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -773,6 +773,8 @@ static inline bool __memcg_kmem_bypass(gfp_t gfp) { if (!memcg_kmem_enabled()) return true; + if (!(gfp & __GFP_ACCOUNT)) + return true; if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) return true; return false; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9d666df5ef95..ca58bfcdadac 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3402,7 +3402,8 @@ EXPORT_SYMBOL(__free_page_frag); /* * alloc_kmem_pages charges newly allocated pages to the kmem resource counter - * of the current memory cgroup. + * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is + * equivalent to alloc_pages. * * It should be used when the caller would like to use kmalloc, but since the * allocation is large, it has to fall back to the page allocator. -- cgit v1.2.3 From 230e9fc2860450fbb1f33bdcf9093d92d7d91f5b Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 14 Jan 2016 15:18:15 -0800 Subject: slab: add SLAB_ACCOUNT flag Currently, if we want to account all objects of a particular kmem cache, we have to pass __GFP_ACCOUNT to each kmem_cache_alloc call, which is inconvenient. This patch introduces SLAB_ACCOUNT flag which if passed to kmem_cache_create will force accounting for every allocation from this cache even if __GFP_ACCOUNT is not passed. This patch does not make any of the existing caches use this flag - it will be done later in the series. Note, a cache with SLAB_ACCOUNT cannot be merged with a cache w/o SLAB_ACCOUNT, because merged caches share the same kmem_cache struct and hence cannot have different sets of SLAB_* flags. Thus using this flag will probably reduce the number of merged slabs even if kmem accounting is not used (only compiled in). Signed-off-by: Vladimir Davydov Suggested-by: Tejun Heo Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Greg Thelen Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 15 +++++++-------- include/linux/slab.h | 5 +++++ mm/memcontrol.c | 8 +++++++- mm/slab.h | 5 +++-- mm/slab_common.c | 3 ++- mm/slub.c | 2 ++ 6 files changed, 26 insertions(+), 12 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c9d9a8e7b45f..5c97265c1c6e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -766,15 +766,13 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } -struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep); +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); void __memcg_kmem_put_cache(struct kmem_cache *cachep); -static inline bool __memcg_kmem_bypass(gfp_t gfp) +static inline bool __memcg_kmem_bypass(void) { if (!memcg_kmem_enabled()) return true; - if (!(gfp & __GFP_ACCOUNT)) - return true; if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) return true; return false; @@ -791,7 +789,9 @@ static inline bool __memcg_kmem_bypass(gfp_t gfp) static __always_inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { - if (__memcg_kmem_bypass(gfp)) + if (__memcg_kmem_bypass()) + return 0; + if (!(gfp & __GFP_ACCOUNT)) return 0; return __memcg_kmem_charge(page, gfp, order); } @@ -810,16 +810,15 @@ static __always_inline void memcg_kmem_uncharge(struct page *page, int order) /** * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation * @cachep: the original global kmem cache - * @gfp: allocation flags. * * All memory allocated from a per-memcg cache is charged to the owner memcg. */ static __always_inline struct kmem_cache * memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) { - if (__memcg_kmem_bypass(gfp)) + if (__memcg_kmem_bypass()) return cachep; - return __memcg_kmem_get_cache(cachep); + return __memcg_kmem_get_cache(cachep, gfp); } static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep) diff --git a/include/linux/slab.h b/include/linux/slab.h index 2037a861e367..3ffee7422012 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -86,6 +86,11 @@ #else # define SLAB_FAILSLAB 0x00000000UL #endif +#ifdef CONFIG_MEMCG_KMEM +# define SLAB_ACCOUNT 0x04000000UL /* Account to memcg */ +#else +# define SLAB_ACCOUNT 0x00000000UL +#endif /* The following flags affect the page allocator grouping pages by mobility */ #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 14cb1db4c52b..4bd6c4513393 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2356,7 +2356,7 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, * Can't be called in interrupt context or from kernel threads. * This function needs to be called with rcu_read_lock() held. */ -struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) { struct mem_cgroup *memcg; struct kmem_cache *memcg_cachep; @@ -2364,6 +2364,12 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) VM_BUG_ON(!is_root_cache(cachep)); + if (cachep->flags & SLAB_ACCOUNT) + gfp |= __GFP_ACCOUNT; + + if (!(gfp & __GFP_ACCOUNT)) + return cachep; + if (current->memcg_kmem_skip_account) return cachep; diff --git a/mm/slab.h b/mm/slab.h index 7b6087197997..c63b8699cfa3 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -128,10 +128,11 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, #if defined(CONFIG_SLAB) #define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ - SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK) + SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ + SLAB_NOTRACK | SLAB_ACCOUNT) #elif defined(CONFIG_SLUB) #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | SLAB_NOTRACK) + SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT) #else #define SLAB_CACHE_FLAGS (0) #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index 3c6a86b4ec25..e016178063e1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -37,7 +37,8 @@ struct kmem_cache *kmem_cache; SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ SLAB_FAILSLAB) -#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | SLAB_NOTRACK) +#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ + SLAB_NOTRACK | SLAB_ACCOUNT) /* * Merge control. If this is set then no merging of slab caches will occur. diff --git a/mm/slub.c b/mm/slub.c index 46997517406e..2d0e610d195a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5362,6 +5362,8 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'F'; if (!(s->flags & SLAB_NOTRACK)) *p++ = 't'; + if (s->flags & SLAB_ACCOUNT) + *p++ = 'A'; if (p != name + 1) *p++ = '-'; p += sprintf(p, "%07d", s->size); -- cgit v1.2.3 From 9ee11ba4251dddf1b0e507d184b25b1bd7820773 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 14 Jan 2016 15:19:41 -0800 Subject: memcg: do not allow to disable tcp accounting after limit is set There are two bits defined for cg_proto->flags - MEMCG_SOCK_ACTIVATED and MEMCG_SOCK_ACTIVE - both are set in tcp_update_limit, but the former is never cleared while the latter can be cleared by unsetting the limit. This allows to disable tcp socket accounting for new sockets after it was enabled by writing -1 to memory.kmem.tcp.limit_in_bytes while still guaranteeing that memcg_socket_limit_enabled static key will be decremented on memcg destruction. This functionality looks dubious, because it is not clear what a use case would be. By enabling tcp accounting a user accepts the price. If they then find the performance degradation unacceptable, they can always restart their workload with tcp accounting disabled. It does not seem there is any need to flip it while the workload is running. Besides, it contradicts to how kmem accounting API works: writing whatever to memory.kmem.limit_in_bytes enables kmem accounting for the cgroup in question, after which it cannot be disabled. Therefore one might expect that writing -1 to memory.kmem.tcp.limit_in_bytes just enables socket accounting w/o limiting it, which might be useful by itself, but it isn't true. Since this API peculiarity is not documented anywhere, I propose to drop it. This will allow to simplify the code by dropping cg_proto->flags. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 12 +----------- mm/memcontrol.c | 2 +- net/ipv4/tcp_memcontrol.c | 17 +++++------------ 3 files changed, 7 insertions(+), 24 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5c97265c1c6e..78a1ec2e23fc 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -85,22 +85,12 @@ enum mem_cgroup_events_target { MEM_CGROUP_NTARGETS, }; -/* - * Bits in struct cg_proto.flags - */ -enum cg_proto_flags { - /* Currently active and new sockets should be assigned to cgroups */ - MEMCG_SOCK_ACTIVE, - /* It was ever activated; we must disarm static keys on destruction */ - MEMCG_SOCK_ACTIVATED, -}; - struct cg_proto { struct page_counter memory_allocated; /* Current allocated memory. */ struct percpu_counter sockets_allocated; /* Current number of sockets. */ int memory_pressure; + bool active; long sysctl_mem[3]; - unsigned long flags; /* * memcg field is used to find which memcg we belong directly * Each memcg struct can hold more than one cg_proto, so container_of diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4bd6c4513393..0bc140d998ad 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -316,7 +316,7 @@ void sock_update_memcg(struct sock *sk) rcu_read_lock(); memcg = mem_cgroup_from_task(current); cg_proto = sk->sk_prot->proto_cgroup(memcg); - if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) && + if (cg_proto && cg_proto->active && css_tryget_online(&memcg->css)) { sk->sk_cgrp = cg_proto; } diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 2379c1b4efb2..d07579ada001 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -48,7 +48,7 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg) percpu_counter_destroy(&cg_proto->sockets_allocated); - if (test_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags)) + if (cg_proto->active) static_key_slow_dec(&memcg_socket_limit_enabled); } @@ -72,11 +72,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) cg_proto->sysctl_mem[i] = min_t(long, nr_pages, sysctl_tcp_mem[i]); - if (nr_pages == PAGE_COUNTER_MAX) - clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); - else { + if (!cg_proto->active) { /* - * The active bit needs to be written after the static_key + * The active flag needs to be written after the static_key * update. This is what guarantees that the socket activation * function is the last one to run. See sock_update_memcg() for * details, and note that we don't mark any socket as belonging @@ -90,14 +88,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) * We never race with the readers in sock_update_memcg(), * because when this value change, the code to process it is not * patched in yet. - * - * The activated bit is used to guarantee that no two writers - * will do the update in the same memcg. Without that, we can't - * properly shutdown the static key. */ - if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags)) - static_key_slow_inc(&memcg_socket_limit_enabled); - set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); + static_key_slow_inc(&memcg_socket_limit_enabled); + cg_proto->active = true; } return 0; -- cgit v1.2.3 From 7d828602e5ef3297a69392a2d31264e4ab9c8bb7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:20:56 -0800 Subject: mm: memcontrol: export root_mem_cgroup A later patch will need this symbol in files other than memcontrol.c, so export it now and replace mem_cgroup_root_css at the same time. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: David S. Miller Reviewed-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 ++- mm/backing-dev.c | 2 +- mm/memcontrol.c | 5 ++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 78a1ec2e23fc..d0c724f53691 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -265,7 +265,8 @@ struct mem_cgroup { struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ }; -extern struct cgroup_subsys_state *mem_cgroup_root_css; + +extern struct mem_cgroup *root_mem_cgroup; /** * mem_cgroup_events - count memory events against a cgroup diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 7340353f8aea..cc5d29d2da9b 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -672,7 +672,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (!ret) { - bdi->wb.memcg_css = mem_cgroup_root_css; + bdi->wb.memcg_css = &root_mem_cgroup->css; bdi->wb.blkcg_css = blkcg_root_css; } return ret; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0bc140d998ad..44ed2dee8f0c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -76,9 +76,9 @@ struct cgroup_subsys memory_cgrp_subsys __read_mostly; EXPORT_SYMBOL(memory_cgrp_subsys); +struct mem_cgroup *root_mem_cgroup __read_mostly; + #define MEM_CGROUP_RECLAIM_RETRIES 5 -static struct mem_cgroup *root_mem_cgroup __read_mostly; -struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP @@ -4241,7 +4241,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) /* root ? */ if (parent_css == NULL) { root_mem_cgroup = memcg; - mem_cgroup_root_css = &memcg->css; page_counter_init(&memcg->memory, NULL); memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; -- cgit v1.2.3 From 3d596f7b907b0281b997cf30c92994a71ad0a1a9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:05 -0800 Subject: net: tcp_memcontrol: protect all tcp_memcontrol calls by jump-label Move the jump-label from sock_update_memcg() and sock_release_memcg() to the callsite, and so eliminate those function calls when socket accounting is not enabled. This also eliminates the need for dummy functions because the calls will be optimized away if the Kconfig options are not enabled. Signed-off-by: Johannes Weiner Acked-by: David S. Miller Reviewed-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 9 -------- mm/memcontrol.c | 56 +++++++++++++++++++++------------------------- net/core/sock.c | 9 ++------ net/ipv4/tcp.c | 3 ++- net/ipv4/tcp_ipv4.c | 4 +++- 5 files changed, 32 insertions(+), 49 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d0c724f53691..85c437b0cbc0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -694,17 +694,8 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb, #endif /* CONFIG_CGROUP_WRITEBACK */ struct sock; -#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) void sock_update_memcg(struct sock *sk); void sock_release_memcg(struct sock *sk); -#else -static inline void sock_update_memcg(struct sock *sk) -{ -} -static inline void sock_release_memcg(struct sock *sk) -{ -} -#endif /* CONFIG_INET && CONFIG_MEMCG_KMEM */ #ifdef CONFIG_MEMCG_KMEM extern struct static_key memcg_kmem_enabled_key; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 44ed2dee8f0c..d9344dad207e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -293,46 +293,40 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) void sock_update_memcg(struct sock *sk) { - if (mem_cgroup_sockets_enabled) { - struct mem_cgroup *memcg; - struct cg_proto *cg_proto; + struct mem_cgroup *memcg; + struct cg_proto *cg_proto; - BUG_ON(!sk->sk_prot->proto_cgroup); + BUG_ON(!sk->sk_prot->proto_cgroup); - /* Socket cloning can throw us here with sk_cgrp already - * filled. It won't however, necessarily happen from - * process context. So the test for root memcg given - * the current task's memcg won't help us in this case. - * - * Respecting the original socket's memcg is a better - * decision in this case. - */ - if (sk->sk_cgrp) { - BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); - css_get(&sk->sk_cgrp->memcg->css); - return; - } + /* Socket cloning can throw us here with sk_cgrp already + * filled. It won't however, necessarily happen from + * process context. So the test for root memcg given + * the current task's memcg won't help us in this case. + * + * Respecting the original socket's memcg is a better + * decision in this case. + */ + if (sk->sk_cgrp) { + BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); + css_get(&sk->sk_cgrp->memcg->css); + return; + } - rcu_read_lock(); - memcg = mem_cgroup_from_task(current); - cg_proto = sk->sk_prot->proto_cgroup(memcg); - if (cg_proto && cg_proto->active && - css_tryget_online(&memcg->css)) { - sk->sk_cgrp = cg_proto; - } - rcu_read_unlock(); + rcu_read_lock(); + memcg = mem_cgroup_from_task(current); + cg_proto = sk->sk_prot->proto_cgroup(memcg); + if (cg_proto && cg_proto->active && + css_tryget_online(&memcg->css)) { + sk->sk_cgrp = cg_proto; } + rcu_read_unlock(); } EXPORT_SYMBOL(sock_update_memcg); void sock_release_memcg(struct sock *sk) { - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { - struct mem_cgroup *memcg; - WARN_ON(!sk->sk_cgrp->memcg); - memcg = sk->sk_cgrp->memcg; - css_put(&sk->sk_cgrp->memcg->css); - } + WARN_ON(!sk->sk_cgrp->memcg); + css_put(&sk->sk_cgrp->memcg->css); } struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) diff --git a/net/core/sock.c b/net/core/sock.c index 51270238e269..6c5dab01105b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1507,12 +1507,6 @@ void sk_free(struct sock *sk) } EXPORT_SYMBOL(sk_free); -static void sk_update_clone(const struct sock *sk, struct sock *newsk) -{ - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - sock_update_memcg(newsk); -} - /** * sk_clone_lock - clone a socket, and lock its clone * @sk: the socket to clone @@ -1607,7 +1601,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sk_set_socket(newsk, NULL); newsk->sk_wq = NULL; - sk_update_clone(sk, newsk); + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) + sock_update_memcg(newsk); if (newsk->sk_prot->sockets_allocated) sk_sockets_allocated_inc(newsk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7bb1b091efd1..fd17eec93525 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -422,7 +422,8 @@ void tcp_init_sock(struct sock *sk) sk->sk_rcvbuf = sysctl_tcp_rmem[1]; local_bh_disable(); - sock_update_memcg(sk); + if (mem_cgroup_sockets_enabled) + sock_update_memcg(sk); sk_sockets_allocated_inc(sk); local_bh_enable(); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 65947c1f4733..eb39e02899e5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1818,7 +1818,9 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_saved_syn_free(tp); sk_sockets_allocated_dec(sk); - sock_release_memcg(sk); + + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) + sock_release_memcg(sk); } EXPORT_SYMBOL(tcp_v4_destroy_sock); -- cgit v1.2.3 From af95d7df4059cfeab7e7c244f3564214aada7dad Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:08 -0800 Subject: net: tcp_memcontrol: remove dead per-memcg count of allocated sockets The number of allocated sockets is used for calculations in the soft limit phase, where packets are accepted but the socket is under memory pressure. Since there is no soft limit phase in tcp_memcontrol, and memory pressure is only entered when packets are already dropped, this is actually dead code. Remove it. As this is the last user of parent_cg_proto(), remove that too. Signed-off-by: Johannes Weiner Acked-by: David S. Miller Reviewed-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 1 - include/net/sock.h | 39 +++------------------------------------ net/ipv4/tcp_memcontrol.c | 3 --- 3 files changed, 3 insertions(+), 40 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 85c437b0cbc0..15acc04ebdd3 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -87,7 +87,6 @@ enum mem_cgroup_events_target { struct cg_proto { struct page_counter memory_allocated; /* Current allocated memory. */ - struct percpu_counter sockets_allocated; /* Current number of sockets. */ int memory_pressure; bool active; long sysctl_mem[3]; diff --git a/include/net/sock.h b/include/net/sock.h index d3b035c7362b..1f15937ec208 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1098,19 +1098,9 @@ static inline void sk_refcnt_debug_release(const struct sock *sk) #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_NET) extern struct static_key memcg_socket_limit_enabled; -static inline struct cg_proto *parent_cg_proto(struct proto *proto, - struct cg_proto *cg_proto) -{ - return proto->proto_cgroup(parent_mem_cgroup(cg_proto->memcg)); -} #define mem_cgroup_sockets_enabled static_key_false(&memcg_socket_limit_enabled) #else #define mem_cgroup_sockets_enabled 0 -static inline struct cg_proto *parent_cg_proto(struct proto *proto, - struct cg_proto *cg_proto) -{ - return NULL; -} #endif static inline bool sk_stream_memory_free(const struct sock *sk) @@ -1236,41 +1226,18 @@ sk_memory_allocated_sub(struct sock *sk, int amt) static inline void sk_sockets_allocated_dec(struct sock *sk) { - struct proto *prot = sk->sk_prot; - - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { - struct cg_proto *cg_proto = sk->sk_cgrp; - - for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto)) - percpu_counter_dec(&cg_proto->sockets_allocated); - } - - percpu_counter_dec(prot->sockets_allocated); + percpu_counter_dec(sk->sk_prot->sockets_allocated); } static inline void sk_sockets_allocated_inc(struct sock *sk) { - struct proto *prot = sk->sk_prot; - - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { - struct cg_proto *cg_proto = sk->sk_cgrp; - - for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto)) - percpu_counter_inc(&cg_proto->sockets_allocated); - } - - percpu_counter_inc(prot->sockets_allocated); + percpu_counter_inc(sk->sk_prot->sockets_allocated); } static inline int sk_sockets_allocated_read_positive(struct sock *sk) { - struct proto *prot = sk->sk_prot; - - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - return percpu_counter_read_positive(&sk->sk_cgrp->sockets_allocated); - - return percpu_counter_read_positive(prot->sockets_allocated); + return percpu_counter_read_positive(sk->sk_prot->sockets_allocated); } static inline int diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index d07579ada001..6759e0d6bba1 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -32,7 +32,6 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) counter_parent = &parent_cg->memory_allocated; page_counter_init(&cg_proto->memory_allocated, counter_parent); - percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL); return 0; } @@ -46,8 +45,6 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg) if (!cg_proto) return; - percpu_counter_destroy(&cg_proto->sockets_allocated); - if (cg_proto->active) static_key_slow_dec(&memcg_socket_limit_enabled); -- cgit v1.2.3 From 80f23124f57c77915a7b4201d8dcba38a38b23f0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:11 -0800 Subject: net: tcp_memcontrol: simplify the per-memcg limit access tcp_memcontrol replicates the global sysctl_mem limit array per cgroup, but it only ever sets these entries to the value of the memory_allocated page_counter limit. Use the latter directly. Signed-off-by: Johannes Weiner Reviewed-by: Vladimir Davydov Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 1 - include/net/sock.h | 8 +++++--- net/ipv4/tcp_memcontrol.c | 8 -------- 3 files changed, 5 insertions(+), 12 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 15acc04ebdd3..6c91c1b73951 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -89,7 +89,6 @@ struct cg_proto { struct page_counter memory_allocated; /* Current allocated memory. */ int memory_pressure; bool active; - long sysctl_mem[3]; /* * memcg field is used to find which memcg we belong directly * Each memcg struct can hold more than one cg_proto, so container_of diff --git a/include/net/sock.h b/include/net/sock.h index 1f15937ec208..8b1f8e5d3a48 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1162,10 +1162,12 @@ static inline void sk_enter_memory_pressure(struct sock *sk) static inline long sk_prot_mem_limits(const struct sock *sk, int index) { - long *prot = sk->sk_prot->sysctl_mem; + long limit = sk->sk_prot->sysctl_mem[index]; + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - prot = sk->sk_cgrp->sysctl_mem; - return prot[index]; + limit = min_t(long, limit, sk->sk_cgrp->memory_allocated.limit); + + return limit; } static inline void memcg_memory_allocated_add(struct cg_proto *prot, diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 6759e0d6bba1..ef4268d12e43 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -21,9 +21,6 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) if (!cg_proto) return 0; - cg_proto->sysctl_mem[0] = sysctl_tcp_mem[0]; - cg_proto->sysctl_mem[1] = sysctl_tcp_mem[1]; - cg_proto->sysctl_mem[2] = sysctl_tcp_mem[2]; cg_proto->memory_pressure = 0; cg_proto->memcg = memcg; @@ -54,7 +51,6 @@ EXPORT_SYMBOL(tcp_destroy_cgroup); static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) { struct cg_proto *cg_proto; - int i; int ret; cg_proto = tcp_prot.proto_cgroup(memcg); @@ -65,10 +61,6 @@ static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) if (ret) return ret; - for (i = 0; i < 3; i++) - cg_proto->sysctl_mem[i] = min_t(long, nr_pages, - sysctl_tcp_mem[i]); - if (!cg_proto->active) { /* * The active flag needs to be written after the static_key -- cgit v1.2.3 From e805605c721021879a1469bdae45c6f80bc985f4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:14 -0800 Subject: net: tcp_memcontrol: sanitize tcp memory accounting callbacks There won't be a tcp control soft limit, so integrating the memcg code into the global skmem limiting scheme complicates things unnecessarily. Replace this with simple and clear charge and uncharge calls--hidden behind a jump label--to account skb memory. Note that this is not purely aesthetic: as a result of shoehorning the per-memcg code into the same memory accounting functions that handle the global level, the old code would compare the per-memcg consumption against the smaller of the per-memcg limit and the global limit. This allowed the total consumption of multiple sockets to exceed the global limit, as long as the individual sockets stayed within bounds. After this change, the code will always compare the per-memcg consumption to the per-memcg limit, and the global consumption to the global limit, and thus close this loophole. Without a soft limit, the per-memcg memory pressure state in sockets is generally questionable. However, we did it until now, so we continue to enter it when the hard limit is hit, and packets are dropped, to let other sockets in the cgroup know that they shouldn't grow their transmit windows, either. However, keep it simple in the new callback model and leave memory pressure lazily when the next packet is accepted (as opposed to doing it synchroneously when packets are processed). When packets are dropped, network performance will already be in the toilet, so that should be a reasonable trade-off. As described above, consumption is now checked on the per-memcg level and the global level separately. Likewise, memory pressure states are maintained on both the per-memcg level and the global level, and a socket is considered under pressure when either level asserts as much. Signed-off-by: Johannes Weiner Reviewed-by: Vladimir Davydov Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 19 +++++++++----- include/net/sock.h | 64 ++++++---------------------------------------- include/net/tcp.h | 5 ++-- mm/memcontrol.c | 32 +++++++++++++++++++++++ net/core/sock.c | 26 +++++++++++-------- net/ipv4/tcp_output.c | 7 +++-- 6 files changed, 77 insertions(+), 76 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6c91c1b73951..e4e77bd1dd39 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -660,12 +660,6 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) } #endif /* CONFIG_MEMCG */ -enum { - UNDER_LIMIT, - SOFT_LIMIT, - OVER_LIMIT, -}; - #ifdef CONFIG_CGROUP_WRITEBACK struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg); @@ -694,6 +688,19 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb, struct sock; void sock_update_memcg(struct sock *sk); void sock_release_memcg(struct sock *sk); +bool mem_cgroup_charge_skmem(struct cg_proto *proto, unsigned int nr_pages); +void mem_cgroup_uncharge_skmem(struct cg_proto *proto, unsigned int nr_pages); +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) +static inline bool mem_cgroup_under_socket_pressure(struct cg_proto *proto) +{ + return proto->memory_pressure; +} +#else +static inline bool mem_cgroup_under_pressure(struct cg_proto *proto) +{ + return false; +} +#endif #ifdef CONFIG_MEMCG_KMEM extern struct static_key memcg_kmem_enabled_key; diff --git a/include/net/sock.h b/include/net/sock.h index 8b1f8e5d3a48..94a6c1a740b9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1129,8 +1129,9 @@ static inline bool sk_under_memory_pressure(const struct sock *sk) if (!sk->sk_prot->memory_pressure) return false; - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - return !!sk->sk_cgrp->memory_pressure; + if (mem_cgroup_sockets_enabled && sk->sk_cgrp && + mem_cgroup_under_socket_pressure(sk->sk_cgrp)) + return true; return !!*sk->sk_prot->memory_pressure; } @@ -1144,9 +1145,6 @@ static inline void sk_leave_memory_pressure(struct sock *sk) if (*memory_pressure) *memory_pressure = 0; - - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - sk->sk_cgrp->memory_pressure = 0; } static inline void sk_enter_memory_pressure(struct sock *sk) @@ -1154,76 +1152,30 @@ static inline void sk_enter_memory_pressure(struct sock *sk) if (!sk->sk_prot->enter_memory_pressure) return; - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - sk->sk_cgrp->memory_pressure = 1; - sk->sk_prot->enter_memory_pressure(sk); } static inline long sk_prot_mem_limits(const struct sock *sk, int index) { - long limit = sk->sk_prot->sysctl_mem[index]; - - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - limit = min_t(long, limit, sk->sk_cgrp->memory_allocated.limit); - - return limit; -} - -static inline void memcg_memory_allocated_add(struct cg_proto *prot, - unsigned long amt, - int *parent_status) -{ - struct page_counter *counter; - - if (page_counter_try_charge(&prot->memory_allocated, amt, &counter)) - return; - - page_counter_charge(&prot->memory_allocated, amt); - *parent_status = OVER_LIMIT; -} - -static inline void memcg_memory_allocated_sub(struct cg_proto *prot, - unsigned long amt) -{ - page_counter_uncharge(&prot->memory_allocated, amt); + return sk->sk_prot->sysctl_mem[index]; } static inline long sk_memory_allocated(const struct sock *sk) { - struct proto *prot = sk->sk_prot; - - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - return page_counter_read(&sk->sk_cgrp->memory_allocated); - - return atomic_long_read(prot->memory_allocated); + return atomic_long_read(sk->sk_prot->memory_allocated); } static inline long -sk_memory_allocated_add(struct sock *sk, int amt, int *parent_status) +sk_memory_allocated_add(struct sock *sk, int amt) { - struct proto *prot = sk->sk_prot; - - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { - memcg_memory_allocated_add(sk->sk_cgrp, amt, parent_status); - /* update the root cgroup regardless */ - atomic_long_add_return(amt, prot->memory_allocated); - return page_counter_read(&sk->sk_cgrp->memory_allocated); - } - - return atomic_long_add_return(amt, prot->memory_allocated); + return atomic_long_add_return(amt, sk->sk_prot->memory_allocated); } static inline void sk_memory_allocated_sub(struct sock *sk, int amt) { - struct proto *prot = sk->sk_prot; - - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - memcg_memory_allocated_sub(sk->sk_cgrp, amt); - - atomic_long_sub(amt, prot->memory_allocated); + atomic_long_sub(amt, sk->sk_prot->memory_allocated); } static inline void sk_sockets_allocated_dec(struct sock *sk) diff --git a/include/net/tcp.h b/include/net/tcp.h index a80255f4ca33..d9df80deba31 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -289,8 +289,9 @@ extern int tcp_memory_pressure; /* optimized version of sk_under_memory_pressure() for TCP sockets */ static inline bool tcp_under_memory_pressure(const struct sock *sk) { - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - return !!sk->sk_cgrp->memory_pressure; + if (mem_cgroup_sockets_enabled && sk->sk_cgrp && + mem_cgroup_under_socket_pressure(sk->sk_cgrp)) + return true; return tcp_memory_pressure; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d9344dad207e..f5de783860b8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -338,6 +338,38 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) } EXPORT_SYMBOL(tcp_proto_cgroup); +/** + * mem_cgroup_charge_skmem - charge socket memory + * @proto: proto to charge + * @nr_pages: number of pages to charge + * + * Charges @nr_pages to @proto. Returns %true if the charge fit within + * @proto's configured limit, %false if the charge had to be forced. + */ +bool mem_cgroup_charge_skmem(struct cg_proto *proto, unsigned int nr_pages) +{ + struct page_counter *counter; + + if (page_counter_try_charge(&proto->memory_allocated, + nr_pages, &counter)) { + proto->memory_pressure = 0; + return true; + } + page_counter_charge(&proto->memory_allocated, nr_pages); + proto->memory_pressure = 1; + return false; +} + +/** + * mem_cgroup_uncharge_skmem - uncharge socket memory + * @proto - proto to uncharge + * @nr_pages - number of pages to uncharge + */ +void mem_cgroup_uncharge_skmem(struct cg_proto *proto, unsigned int nr_pages) +{ + page_counter_uncharge(&proto->memory_allocated, nr_pages); +} + #endif #ifdef CONFIG_MEMCG_KMEM diff --git a/net/core/sock.c b/net/core/sock.c index 6c5dab01105b..89ae859d2dc5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2084,27 +2084,27 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) struct proto *prot = sk->sk_prot; int amt = sk_mem_pages(size); long allocated; - int parent_status = UNDER_LIMIT; sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - allocated = sk_memory_allocated_add(sk, amt, &parent_status); + allocated = sk_memory_allocated_add(sk, amt); + + if (mem_cgroup_sockets_enabled && sk->sk_cgrp && + !mem_cgroup_charge_skmem(sk->sk_cgrp, amt)) + goto suppress_allocation; /* Under limit. */ - if (parent_status == UNDER_LIMIT && - allocated <= sk_prot_mem_limits(sk, 0)) { + if (allocated <= sk_prot_mem_limits(sk, 0)) { sk_leave_memory_pressure(sk); return 1; } - /* Under pressure. (we or our parents) */ - if ((parent_status > SOFT_LIMIT) || - allocated > sk_prot_mem_limits(sk, 1)) + /* Under pressure. */ + if (allocated > sk_prot_mem_limits(sk, 1)) sk_enter_memory_pressure(sk); - /* Over hard limit (we or our parents) */ - if ((parent_status == OVER_LIMIT) || - (allocated > sk_prot_mem_limits(sk, 2))) + /* Over hard limit. */ + if (allocated > sk_prot_mem_limits(sk, 2)) goto suppress_allocation; /* guarantee minimum buffer size under pressure */ @@ -2153,6 +2153,9 @@ suppress_allocation: sk_memory_allocated_sub(sk, amt); + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) + mem_cgroup_uncharge_skmem(sk->sk_cgrp, amt); + return 0; } EXPORT_SYMBOL(__sk_mem_schedule); @@ -2168,6 +2171,9 @@ void __sk_mem_reclaim(struct sock *sk, int amount) sk_memory_allocated_sub(sk, amount); sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) + mem_cgroup_uncharge_skmem(sk->sk_cgrp, amount); + if (sk_under_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 412a920fe0ec..493b48945f0c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2813,13 +2813,16 @@ begin_fwd: */ void sk_forced_mem_schedule(struct sock *sk, int size) { - int amt, status; + int amt; if (size <= sk->sk_forward_alloc) return; amt = sk_mem_pages(size); sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - sk_memory_allocated_add(sk, amt, &status); + sk_memory_allocated_add(sk, amt); + + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) + mem_cgroup_charge_skmem(sk->sk_cgrp, amt); } /* Send a FIN. The caller locks the socket for us. -- cgit v1.2.3 From baac50bbc3cdfd184ebf586b1704edbfcee866df Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:17 -0800 Subject: net: tcp_memcontrol: simplify linkage between socket and page counter There won't be any separate counters for socket memory consumed by protocols other than TCP in the future. Remove the indirection and link sockets directly to their owning memory cgroup. Signed-off-by: Johannes Weiner Reviewed-by: Vladimir Davydov Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 20 ++++--------- include/net/sock.h | 25 +++-------------- include/net/tcp.h | 4 +-- include/net/tcp_memcontrol.h | 1 - mm/memcontrol.c | 57 +++++++++++++++---------------------- net/core/sock.c | 52 +++++----------------------------- net/ipv4/tcp_ipv4.c | 7 +---- net/ipv4/tcp_memcontrol.c | 67 +++++++++++++++++--------------------------- net/ipv4/tcp_output.c | 4 +-- net/ipv6/tcp_ipv6.c | 3 -- 10 files changed, 69 insertions(+), 171 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e4e77bd1dd39..7c085e4636ba 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -89,16 +89,6 @@ struct cg_proto { struct page_counter memory_allocated; /* Current allocated memory. */ int memory_pressure; bool active; - /* - * memcg field is used to find which memcg we belong directly - * Each memcg struct can hold more than one cg_proto, so container_of - * won't really cut. - * - * The elegant solution would be having an inverse function to - * proto_cgroup in struct proto, but that means polluting the structure - * for everybody, instead of just for memcg users. - */ - struct mem_cgroup *memcg; }; #ifdef CONFIG_MEMCG @@ -688,15 +678,15 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb, struct sock; void sock_update_memcg(struct sock *sk); void sock_release_memcg(struct sock *sk); -bool mem_cgroup_charge_skmem(struct cg_proto *proto, unsigned int nr_pages); -void mem_cgroup_uncharge_skmem(struct cg_proto *proto, unsigned int nr_pages); +bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); +void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) -static inline bool mem_cgroup_under_socket_pressure(struct cg_proto *proto) +static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { - return proto->memory_pressure; + return memcg->tcp_mem.memory_pressure; } #else -static inline bool mem_cgroup_under_pressure(struct cg_proto *proto) +static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { return false; } diff --git a/include/net/sock.h b/include/net/sock.h index 94a6c1a740b9..be96a8dcbc74 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -71,22 +71,6 @@ #include #include -struct cgroup; -struct cgroup_subsys; -#ifdef CONFIG_NET -int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss); -void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg); -#else -static inline -int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss) -{ - return 0; -} -static inline -void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg) -{ -} -#endif /* * This structure really needs to be cleaned up. * Most of it is for TCP, and not used by any of @@ -245,7 +229,6 @@ struct sock_common { /* public: */ }; -struct cg_proto; /** * struct sock - network layer representation of sockets * @__sk_common: shared layout with inet_timewait_sock @@ -310,7 +293,7 @@ struct cg_proto; * @sk_security: used by security modules * @sk_mark: generic packet mark * @sk_cgrp_data: cgroup data for this cgroup - * @sk_cgrp: this socket's cgroup-specific proto data + * @sk_memcg: this socket's memory cgroup association * @sk_write_pending: a write to stream socket waits to start * @sk_state_change: callback to indicate change in the state of the sock * @sk_data_ready: callback to indicate there is data to be processed @@ -446,7 +429,7 @@ struct sock { void *sk_security; #endif struct sock_cgroup_data sk_cgrp_data; - struct cg_proto *sk_cgrp; + struct mem_cgroup *sk_memcg; void (*sk_state_change)(struct sock *sk); void (*sk_data_ready)(struct sock *sk); void (*sk_write_space)(struct sock *sk); @@ -1129,8 +1112,8 @@ static inline bool sk_under_memory_pressure(const struct sock *sk) if (!sk->sk_prot->memory_pressure) return false; - if (mem_cgroup_sockets_enabled && sk->sk_cgrp && - mem_cgroup_under_socket_pressure(sk->sk_cgrp)) + if (mem_cgroup_sockets_enabled && sk->sk_memcg && + mem_cgroup_under_socket_pressure(sk->sk_memcg)) return true; return !!*sk->sk_prot->memory_pressure; diff --git a/include/net/tcp.h b/include/net/tcp.h index d9df80deba31..8ea19977ea53 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -289,8 +289,8 @@ extern int tcp_memory_pressure; /* optimized version of sk_under_memory_pressure() for TCP sockets */ static inline bool tcp_under_memory_pressure(const struct sock *sk) { - if (mem_cgroup_sockets_enabled && sk->sk_cgrp && - mem_cgroup_under_socket_pressure(sk->sk_cgrp)) + if (mem_cgroup_sockets_enabled && sk->sk_memcg && + mem_cgroup_under_socket_pressure(sk->sk_memcg)) return true; return tcp_memory_pressure; diff --git a/include/net/tcp_memcontrol.h b/include/net/tcp_memcontrol.h index 05b94d9453de..3a17b16ae8aa 100644 --- a/include/net/tcp_memcontrol.h +++ b/include/net/tcp_memcontrol.h @@ -1,7 +1,6 @@ #ifndef _TCP_MEMCG_H #define _TCP_MEMCG_H -struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg); int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss); void tcp_destroy_cgroup(struct mem_cgroup *memcg); #endif /* _TCP_MEMCG_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f5de783860b8..eaaa86126277 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -294,9 +294,6 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) void sock_update_memcg(struct sock *sk) { struct mem_cgroup *memcg; - struct cg_proto *cg_proto; - - BUG_ON(!sk->sk_prot->proto_cgroup); /* Socket cloning can throw us here with sk_cgrp already * filled. It won't however, necessarily happen from @@ -306,68 +303,58 @@ void sock_update_memcg(struct sock *sk) * Respecting the original socket's memcg is a better * decision in this case. */ - if (sk->sk_cgrp) { - BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); - css_get(&sk->sk_cgrp->memcg->css); + if (sk->sk_memcg) { + BUG_ON(mem_cgroup_is_root(sk->sk_memcg)); + css_get(&sk->sk_memcg->css); return; } rcu_read_lock(); memcg = mem_cgroup_from_task(current); - cg_proto = sk->sk_prot->proto_cgroup(memcg); - if (cg_proto && cg_proto->active && - css_tryget_online(&memcg->css)) { - sk->sk_cgrp = cg_proto; - } + if (memcg != root_mem_cgroup && + memcg->tcp_mem.active && + css_tryget_online(&memcg->css)) + sk->sk_memcg = memcg; rcu_read_unlock(); } EXPORT_SYMBOL(sock_update_memcg); void sock_release_memcg(struct sock *sk) { - WARN_ON(!sk->sk_cgrp->memcg); - css_put(&sk->sk_cgrp->memcg->css); -} - -struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) -{ - if (!memcg || mem_cgroup_is_root(memcg)) - return NULL; - - return &memcg->tcp_mem; + WARN_ON(!sk->sk_memcg); + css_put(&sk->sk_memcg->css); } -EXPORT_SYMBOL(tcp_proto_cgroup); /** * mem_cgroup_charge_skmem - charge socket memory - * @proto: proto to charge + * @memcg: memcg to charge * @nr_pages: number of pages to charge * - * Charges @nr_pages to @proto. Returns %true if the charge fit within - * @proto's configured limit, %false if the charge had to be forced. + * Charges @nr_pages to @memcg. Returns %true if the charge fit within + * @memcg's configured limit, %false if the charge had to be forced. */ -bool mem_cgroup_charge_skmem(struct cg_proto *proto, unsigned int nr_pages) +bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { struct page_counter *counter; - if (page_counter_try_charge(&proto->memory_allocated, + if (page_counter_try_charge(&memcg->tcp_mem.memory_allocated, nr_pages, &counter)) { - proto->memory_pressure = 0; + memcg->tcp_mem.memory_pressure = 0; return true; } - page_counter_charge(&proto->memory_allocated, nr_pages); - proto->memory_pressure = 1; + page_counter_charge(&memcg->tcp_mem.memory_allocated, nr_pages); + memcg->tcp_mem.memory_pressure = 1; return false; } /** * mem_cgroup_uncharge_skmem - uncharge socket memory - * @proto - proto to uncharge + * @memcg - memcg to uncharge * @nr_pages - number of pages to uncharge */ -void mem_cgroup_uncharge_skmem(struct cg_proto *proto, unsigned int nr_pages) +void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { - page_counter_uncharge(&proto->memory_allocated, nr_pages); + page_counter_uncharge(&memcg->tcp_mem.memory_allocated, nr_pages); } #endif @@ -3653,7 +3640,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) if (ret) return ret; - return mem_cgroup_sockets_init(memcg, ss); + return tcp_init_cgroup(memcg, ss); } static void memcg_deactivate_kmem(struct mem_cgroup *memcg) @@ -3709,7 +3696,7 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg) static_key_slow_dec(&memcg_kmem_enabled_key); WARN_ON(page_counter_read(&memcg->kmem)); } - mem_cgroup_sockets_destroy(memcg); + tcp_destroy_cgroup(memcg); } #else static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) diff --git a/net/core/sock.c b/net/core/sock.c index 89ae859d2dc5..3535bffa45f3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -195,44 +195,6 @@ bool sk_net_capable(const struct sock *sk, int cap) } EXPORT_SYMBOL(sk_net_capable); - -#ifdef CONFIG_MEMCG_KMEM -int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss) -{ - struct proto *proto; - int ret = 0; - - mutex_lock(&proto_list_mutex); - list_for_each_entry(proto, &proto_list, node) { - if (proto->init_cgroup) { - ret = proto->init_cgroup(memcg, ss); - if (ret) - goto out; - } - } - - mutex_unlock(&proto_list_mutex); - return ret; -out: - list_for_each_entry_continue_reverse(proto, &proto_list, node) - if (proto->destroy_cgroup) - proto->destroy_cgroup(memcg); - mutex_unlock(&proto_list_mutex); - return ret; -} - -void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg) -{ - struct proto *proto; - - mutex_lock(&proto_list_mutex); - list_for_each_entry_reverse(proto, &proto_list, node) - if (proto->destroy_cgroup) - proto->destroy_cgroup(memcg); - mutex_unlock(&proto_list_mutex); -} -#endif - /* * Each address family might have different locking rules, so we have * one slock key per address family: @@ -1601,7 +1563,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sk_set_socket(newsk, NULL); newsk->sk_wq = NULL; - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) + if (mem_cgroup_sockets_enabled && sk->sk_memcg) sock_update_memcg(newsk); if (newsk->sk_prot->sockets_allocated) @@ -2089,8 +2051,8 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) allocated = sk_memory_allocated_add(sk, amt); - if (mem_cgroup_sockets_enabled && sk->sk_cgrp && - !mem_cgroup_charge_skmem(sk->sk_cgrp, amt)) + if (mem_cgroup_sockets_enabled && sk->sk_memcg && + !mem_cgroup_charge_skmem(sk->sk_memcg, amt)) goto suppress_allocation; /* Under limit. */ @@ -2153,8 +2115,8 @@ suppress_allocation: sk_memory_allocated_sub(sk, amt); - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - mem_cgroup_uncharge_skmem(sk->sk_cgrp, amt); + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); return 0; } @@ -2171,8 +2133,8 @@ void __sk_mem_reclaim(struct sock *sk, int amount) sk_memory_allocated_sub(sk, amount); sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - mem_cgroup_uncharge_skmem(sk->sk_cgrp, amount); + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); if (sk_under_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index eb39e02899e5..c7d1fb50f381 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1819,7 +1819,7 @@ void tcp_v4_destroy_sock(struct sock *sk) sk_sockets_allocated_dec(sk); - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) + if (mem_cgroup_sockets_enabled && sk->sk_memcg) sock_release_memcg(sk); } EXPORT_SYMBOL(tcp_v4_destroy_sock); @@ -2343,11 +2343,6 @@ struct proto tcp_prot = { #ifdef CONFIG_COMPAT .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, -#endif -#ifdef CONFIG_MEMCG_KMEM - .init_cgroup = tcp_init_cgroup, - .destroy_cgroup = tcp_destroy_cgroup, - .proto_cgroup = tcp_proto_cgroup, #endif .diag_destroy = tcp_abort, }; diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index ef4268d12e43..e5078259cbe3 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -8,60 +8,47 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + struct page_counter *counter_parent = NULL; /* * The root cgroup does not use page_counters, but rather, * rely on the data already collected by the network * subsystem */ - struct mem_cgroup *parent = parent_mem_cgroup(memcg); - struct page_counter *counter_parent = NULL; - struct cg_proto *cg_proto, *parent_cg; - - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) + if (memcg == root_mem_cgroup) return 0; - cg_proto->memory_pressure = 0; - cg_proto->memcg = memcg; + memcg->tcp_mem.memory_pressure = 0; - parent_cg = tcp_prot.proto_cgroup(parent); - if (parent_cg) - counter_parent = &parent_cg->memory_allocated; + if (parent) + counter_parent = &parent->tcp_mem.memory_allocated; - page_counter_init(&cg_proto->memory_allocated, counter_parent); + page_counter_init(&memcg->tcp_mem.memory_allocated, counter_parent); return 0; } -EXPORT_SYMBOL(tcp_init_cgroup); void tcp_destroy_cgroup(struct mem_cgroup *memcg) { - struct cg_proto *cg_proto; - - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) + if (memcg == root_mem_cgroup) return; - if (cg_proto->active) + if (memcg->tcp_mem.active) static_key_slow_dec(&memcg_socket_limit_enabled); - } -EXPORT_SYMBOL(tcp_destroy_cgroup); static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) { - struct cg_proto *cg_proto; int ret; - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) + if (memcg == root_mem_cgroup) return -EINVAL; - ret = page_counter_limit(&cg_proto->memory_allocated, nr_pages); + ret = page_counter_limit(&memcg->tcp_mem.memory_allocated, nr_pages); if (ret) return ret; - if (!cg_proto->active) { + if (!memcg->tcp_mem.active) { /* * The active flag needs to be written after the static_key * update. This is what guarantees that the socket activation @@ -79,7 +66,7 @@ static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) * patched in yet. */ static_key_slow_inc(&memcg_socket_limit_enabled); - cg_proto->active = true; + memcg->tcp_mem.active = true; } return 0; @@ -123,32 +110,32 @@ static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct cg_proto *cg_proto = tcp_prot.proto_cgroup(memcg); u64 val; switch (cft->private) { case RES_LIMIT: - if (!cg_proto) - return PAGE_COUNTER_MAX; - val = cg_proto->memory_allocated.limit; + if (memcg == root_mem_cgroup) + val = PAGE_COUNTER_MAX; + else + val = memcg->tcp_mem.memory_allocated.limit; val *= PAGE_SIZE; break; case RES_USAGE: - if (!cg_proto) + if (memcg == root_mem_cgroup) val = atomic_long_read(&tcp_memory_allocated); else - val = page_counter_read(&cg_proto->memory_allocated); + val = page_counter_read(&memcg->tcp_mem.memory_allocated); val *= PAGE_SIZE; break; case RES_FAILCNT: - if (!cg_proto) + if (memcg == root_mem_cgroup) return 0; - val = cg_proto->memory_allocated.failcnt; + val = memcg->tcp_mem.memory_allocated.failcnt; break; case RES_MAX_USAGE: - if (!cg_proto) + if (memcg == root_mem_cgroup) return 0; - val = cg_proto->memory_allocated.watermark; + val = memcg->tcp_mem.memory_allocated.watermark; val *= PAGE_SIZE; break; default: @@ -161,19 +148,17 @@ static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg; - struct cg_proto *cg_proto; memcg = mem_cgroup_from_css(of_css(of)); - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) + if (memcg == root_mem_cgroup) return nbytes; switch (of_cft(of)->private) { case RES_MAX_USAGE: - page_counter_reset_watermark(&cg_proto->memory_allocated); + page_counter_reset_watermark(&memcg->tcp_mem.memory_allocated); break; case RES_FAILCNT: - cg_proto->memory_allocated.failcnt = 0; + memcg->tcp_mem.memory_allocated.failcnt = 0; break; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 493b48945f0c..fda379cd600d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2821,8 +2821,8 @@ void sk_forced_mem_schedule(struct sock *sk, int size) sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; sk_memory_allocated_add(sk, amt); - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - mem_cgroup_charge_skmem(sk->sk_cgrp, amt); + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + mem_cgroup_charge_skmem(sk->sk_memcg, amt); } /* Send a FIN. The caller locks the socket for us. diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index db9f1c318afc..4ad8edb46f7c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1888,9 +1888,6 @@ struct proto tcpv6_prot = { #ifdef CONFIG_COMPAT .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, -#endif -#ifdef CONFIG_MEMCG_KMEM - .proto_cgroup = tcp_proto_cgroup, #endif .clear_sk = tcp_v6_clear_sk, .diag_destroy = tcp_abort, -- cgit v1.2.3 From 80e95fe0fdcde2812c341ad4209d62dc1a7af53b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:20 -0800 Subject: mm: memcontrol: generalize the socket accounting jump label The unified hierarchy memory controller is going to use this jump label as well to control the networking callbacks. Move it to the memory controller code and give it a more generic name. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Reviewed-by: Vladimir Davydov Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 +++ include/net/sock.h | 7 ------- mm/memcontrol.c | 3 +++ net/core/sock.c | 5 ----- net/ipv4/tcp_memcontrol.c | 4 ++-- 5 files changed, 8 insertions(+), 14 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7c085e4636ba..03090e8e7fff 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -681,11 +681,14 @@ void sock_release_memcg(struct sock *sk); bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) +extern struct static_key memcg_sockets_enabled_key; +#define mem_cgroup_sockets_enabled static_key_false(&memcg_sockets_enabled_key) static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { return memcg->tcp_mem.memory_pressure; } #else +#define mem_cgroup_sockets_enabled 0 static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { return false; diff --git a/include/net/sock.h b/include/net/sock.h index be96a8dcbc74..b9e7b3d863a0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1079,13 +1079,6 @@ static inline void sk_refcnt_debug_release(const struct sock *sk) #define sk_refcnt_debug_release(sk) do { } while (0) #endif /* SOCK_REFCNT_DEBUG */ -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_NET) -extern struct static_key memcg_socket_limit_enabled; -#define mem_cgroup_sockets_enabled static_key_false(&memcg_socket_limit_enabled) -#else -#define mem_cgroup_sockets_enabled 0 -#endif - static inline bool sk_stream_memory_free(const struct sock *sk) { if (sk->sk_wmem_queued >= sk->sk_sndbuf) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index eaaa86126277..08ef3d2ca663 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -291,6 +291,9 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) /* Writing them here to avoid exposing memcg's inner layout */ #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) +struct static_key memcg_sockets_enabled_key; +EXPORT_SYMBOL(memcg_sockets_enabled_key); + void sock_update_memcg(struct sock *sk) { struct mem_cgroup *memcg; diff --git a/net/core/sock.c b/net/core/sock.c index 3535bffa45f3..6c1c8bc93412 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -202,11 +202,6 @@ EXPORT_SYMBOL(sk_net_capable); static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; -#if defined(CONFIG_MEMCG_KMEM) -struct static_key memcg_socket_limit_enabled; -EXPORT_SYMBOL(memcg_socket_limit_enabled); -#endif - /* * Make lock validator output more readable. (we pre-construct these * strings build-time, so that runtime initialization of socket diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index e5078259cbe3..9a22e2dfd64a 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -34,7 +34,7 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg) return; if (memcg->tcp_mem.active) - static_key_slow_dec(&memcg_socket_limit_enabled); + static_key_slow_dec(&memcg_sockets_enabled_key); } static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) @@ -65,7 +65,7 @@ static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) * because when this value change, the code to process it is not * patched in yet. */ - static_key_slow_inc(&memcg_socket_limit_enabled); + static_key_slow_inc(&memcg_sockets_enabled_key); memcg->tcp_mem.active = true; } -- cgit v1.2.3 From f7e1cb6ec51b041335b5ad4dd7aefb37a56d79a6 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:29 -0800 Subject: mm: memcontrol: account socket memory in unified hierarchy memory controller Socket memory can be a significant share of overall memory consumed by common workloads. In order to provide reasonable resource isolation in the unified hierarchy, this type of memory needs to be included in the tracking/accounting of a cgroup under active memory resource control. Overhead is only incurred when a non-root control group is created AND the memory controller is instructed to track and account the memory footprint of that group. cgroup.memory=nosocket can be specified on the boot commandline to override any runtime configuration and forcibly exclude socket memory from active memory resource control. Signed-off-by: Johannes Weiner Acked-by: David S. Miller Reviewed-by: Vladimir Davydov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 4 ++ include/linux/memcontrol.h | 9 ++- mm/memcontrol.c | 122 +++++++++++++++++++++++++++++------- 3 files changed, 110 insertions(+), 25 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index b7d44871effc..168fd79dc697 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -608,6 +608,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. cut the overhead, others just disable the usage. So only cgroup_disable=memory is actually worthy} + cgroup.memory= [KNL] Pass options to the cgroup memory controller. + Format: + nosocket -- Disable socket memory accounting. + checkreqprot [SELINUX] Set initial checkreqprot flag value. Format: { "0" | "1" } See security/selinux/Kconfig help text. diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 03090e8e7fff..a355f61a2ed3 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -170,6 +170,9 @@ struct mem_cgroup { unsigned long low; unsigned long high; + /* Range enforcement for interrupt charges */ + struct work_struct high_work; + unsigned long soft_limit; /* vmpressure notifications */ @@ -680,12 +683,16 @@ void sock_update_memcg(struct sock *sk); void sock_release_memcg(struct sock *sk); bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) +#if defined(CONFIG_MEMCG) && defined(CONFIG_INET) extern struct static_key memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_key_false(&memcg_sockets_enabled_key) static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { +#ifdef CONFIG_MEMCG_KMEM return memcg->tcp_mem.memory_pressure; +#else + return false; +#endif } #else #define mem_cgroup_sockets_enabled 0 diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6aac8d2e31d7..60ebc486c2aa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -80,6 +80,9 @@ struct mem_cgroup *root_mem_cgroup __read_mostly; #define MEM_CGROUP_RECLAIM_RETRIES 5 +/* Socket memory accounting disabled? */ +static bool cgroup_memory_nosocket; + /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP int do_swap_account __read_mostly; @@ -1945,6 +1948,26 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, return NOTIFY_OK; } +static void reclaim_high(struct mem_cgroup *memcg, + unsigned int nr_pages, + gfp_t gfp_mask) +{ + do { + if (page_counter_read(&memcg->memory) <= memcg->high) + continue; + mem_cgroup_events(memcg, MEMCG_HIGH, 1); + try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); + } while ((memcg = parent_mem_cgroup(memcg))); +} + +static void high_work_func(struct work_struct *work) +{ + struct mem_cgroup *memcg; + + memcg = container_of(work, struct mem_cgroup, high_work); + reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL); +} + /* * Scheduled by try_charge() to be executed from the userland return path * and reclaims memory over the high limit. @@ -1952,20 +1975,13 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, void mem_cgroup_handle_over_high(void) { unsigned int nr_pages = current->memcg_nr_pages_over_high; - struct mem_cgroup *memcg, *pos; + struct mem_cgroup *memcg; if (likely(!nr_pages)) return; - pos = memcg = get_mem_cgroup_from_mm(current->mm); - - do { - if (page_counter_read(&pos->memory) <= pos->high) - continue; - mem_cgroup_events(pos, MEMCG_HIGH, 1); - try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true); - } while ((pos = parent_mem_cgroup(pos))); - + memcg = get_mem_cgroup_from_mm(current->mm); + reclaim_high(memcg, nr_pages, GFP_KERNEL); css_put(&memcg->css); current->memcg_nr_pages_over_high = 0; } @@ -2100,6 +2116,11 @@ done_restock: */ do { if (page_counter_read(&memcg->memory) > memcg->high) { + /* Don't bother a random interrupted task */ + if (in_interrupt()) { + schedule_work(&memcg->high_work); + break; + } current->memcg_nr_pages_over_high += batch; set_notify_resume(current); break; @@ -4150,6 +4171,8 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; + cancel_work_sync(&memcg->high_work); + mem_cgroup_remove_from_trees(memcg); for_each_node(node) @@ -4196,6 +4219,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_init(&memcg->kmem, NULL); } + INIT_WORK(&memcg->high_work, high_work_func); memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); memcg->move_charge_at_immigrate = 0; @@ -4267,6 +4291,11 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) if (ret) return ret; +#ifdef CONFIG_INET + if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) + static_key_slow_inc(&memcg_sockets_enabled_key); +#endif + /* * Make sure the memcg is initialized: mem_cgroup_iter() * orders reading memcg->initialized against its callers @@ -4313,6 +4342,10 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); memcg_destroy_kmem(memcg); +#ifdef CONFIG_INET + if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) + static_key_slow_dec(&memcg_sockets_enabled_key); +#endif __mem_cgroup_free(memcg); } @@ -5533,8 +5566,7 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) commit_charge(newpage, memcg, true); } -/* Writing them here to avoid exposing memcg's inner layout */ -#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) +#ifdef CONFIG_INET struct static_key memcg_sockets_enabled_key; EXPORT_SYMBOL(memcg_sockets_enabled_key); @@ -5559,10 +5591,15 @@ void sock_update_memcg(struct sock *sk) rcu_read_lock(); memcg = mem_cgroup_from_task(current); - if (memcg != root_mem_cgroup && - memcg->tcp_mem.active && - css_tryget_online(&memcg->css)) + if (memcg == root_mem_cgroup) + goto out; +#ifdef CONFIG_MEMCG_KMEM + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcp_mem.active) + goto out; +#endif + if (css_tryget_online(&memcg->css)) sk->sk_memcg = memcg; +out: rcu_read_unlock(); } EXPORT_SYMBOL(sock_update_memcg); @@ -5583,15 +5620,30 @@ void sock_release_memcg(struct sock *sk) */ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { - struct page_counter *counter; + gfp_t gfp_mask = GFP_KERNEL; - if (page_counter_try_charge(&memcg->tcp_mem.memory_allocated, - nr_pages, &counter)) { - memcg->tcp_mem.memory_pressure = 0; - return true; +#ifdef CONFIG_MEMCG_KMEM + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { + struct page_counter *counter; + + if (page_counter_try_charge(&memcg->tcp_mem.memory_allocated, + nr_pages, &counter)) { + memcg->tcp_mem.memory_pressure = 0; + return true; + } + page_counter_charge(&memcg->tcp_mem.memory_allocated, nr_pages); + memcg->tcp_mem.memory_pressure = 1; + return false; } - page_counter_charge(&memcg->tcp_mem.memory_allocated, nr_pages); - memcg->tcp_mem.memory_pressure = 1; +#endif + /* Don't block in the packet receive path */ + if (in_softirq()) + gfp_mask = GFP_NOWAIT; + + if (try_charge(memcg, gfp_mask, nr_pages) == 0) + return true; + + try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages); return false; } @@ -5602,10 +5654,32 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) */ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { - page_counter_uncharge(&memcg->tcp_mem.memory_allocated, nr_pages); +#ifdef CONFIG_MEMCG_KMEM + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { + page_counter_uncharge(&memcg->tcp_mem.memory_allocated, + nr_pages); + return; + } +#endif + page_counter_uncharge(&memcg->memory, nr_pages); + css_put_many(&memcg->css, nr_pages); } -#endif +#endif /* CONFIG_INET */ + +static int __init cgroup_memory(char *s) +{ + char *token; + + while ((token = strsep(&s, ",")) != NULL) { + if (!*token) + continue; + if (!strcmp(token, "nosocket")) + cgroup_memory_nosocket = true; + } + return 0; +} +__setup("cgroup.memory=", cgroup_memory); /* * subsys_initcall() for memory controller. -- cgit v1.2.3 From 8e8ae645249b85c8ed6c178557f8db8613a6bcc7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:32 -0800 Subject: mm: memcontrol: hook up vmpressure to socket pressure Let the networking stack know when a memcg is under reclaim pressure so that it can clamp its transmit windows accordingly. Whenever the reclaim efficiency of a cgroup's LRU lists drops low enough for a MEDIUM or HIGH vmpressure event to occur, assert a pressure state in the socket and tcp memory code that tells it to curb consumption growth from sockets associated with said control group. Traditionally, vmpressure reports for the entire subtree of a memcg under pressure, which drops useful information on the individual groups reclaimed. However, it's too late to change the userinterface, so add a second reporting mode that reports on the level of reclaim instead of at the level of pressure, and use that report for sockets. vmpressure events are naturally edge triggered, so for hysteresis assert socket pressure for a second to allow for subsequent vmpressure events to occur before letting the socket code return to normal. This will likely need finetuning for a wider variety of workloads, but for now stick to the vmpressure presets and keep hysteresis simple. Signed-off-by: Johannes Weiner Acked-by: David S. Miller Reviewed-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 32 ++++++++++++++++--- include/linux/vmpressure.h | 7 +++-- mm/memcontrol.c | 17 ++-------- mm/vmpressure.c | 78 +++++++++++++++++++++++++++++++++++----------- mm/vmscan.c | 10 +++++- 5 files changed, 104 insertions(+), 40 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a355f61a2ed3..c5a51039df57 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -249,6 +249,10 @@ struct mem_cgroup { struct wb_domain cgwb_domain; #endif +#ifdef CONFIG_INET + unsigned long socket_pressure; +#endif + /* List of events which userspace want to receive */ struct list_head event_list; spinlock_t event_list_lock; @@ -290,18 +294,34 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); -struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; } +#define mem_cgroup_from_counter(counter, member) \ + container_of(counter, struct mem_cgroup, member) + struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, struct mem_cgroup *, struct mem_cgroup_reclaim_cookie *); void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); +/** + * parent_mem_cgroup - find the accounting parent of a memcg + * @memcg: memcg whose parent to find + * + * Returns the parent memcg, or NULL if this is the root or the memory + * controller is in legacy no-hierarchy mode. + */ +static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) +{ + if (!memcg->memory.parent) + return NULL; + return mem_cgroup_from_counter(memcg->memory.parent, memory); +} + static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root) { @@ -689,10 +709,14 @@ extern struct static_key memcg_sockets_enabled_key; static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { #ifdef CONFIG_MEMCG_KMEM - return memcg->tcp_mem.memory_pressure; -#else - return false; + if (memcg->tcp_mem.memory_pressure) + return true; #endif + do { + if (time_before(jiffies, memcg->socket_pressure)) + return true; + } while ((memcg = parent_mem_cgroup(memcg))); + return false; } #else #define mem_cgroup_sockets_enabled 0 diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 3e4535876d37..3347cc3ec0ab 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -12,6 +12,9 @@ struct vmpressure { unsigned long scanned; unsigned long reclaimed; + + unsigned long tree_scanned; + unsigned long tree_reclaimed; /* The lock is used to keep the scanned/reclaimed above in sync. */ struct spinlock sr_lock; @@ -26,7 +29,7 @@ struct vmpressure { struct mem_cgroup; #ifdef CONFIG_MEMCG -extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, +extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, unsigned long scanned, unsigned long reclaimed); extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio); @@ -40,7 +43,7 @@ extern int vmpressure_register_event(struct mem_cgroup *memcg, extern void vmpressure_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd); #else -static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, +static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, unsigned long scanned, unsigned long reclaimed) {} static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) {} diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 60ebc486c2aa..df7f144a5a4b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1113,9 +1113,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) return ret; } -#define mem_cgroup_from_counter(counter, member) \ - container_of(counter, struct mem_cgroup, member) - /** * mem_cgroup_margin - calculate chargeable space of a memory cgroup * @memcg: the memory cgroup @@ -4183,17 +4180,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) kfree(memcg); } -/* - * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. - */ -struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) -{ - if (!memcg->memory.parent) - return NULL; - return mem_cgroup_from_counter(memcg->memory.parent, memory); -} -EXPORT_SYMBOL(parent_mem_cgroup); - static struct cgroup_subsys_state * __ref mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -4233,6 +4219,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) #endif #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); +#endif +#ifdef CONFIG_INET + memcg->socket_pressure = jiffies; #endif return &memcg->css; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index c5afd573d7da..506f03e4be47 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -137,14 +137,11 @@ struct vmpressure_event { }; static bool vmpressure_event(struct vmpressure *vmpr, - unsigned long scanned, unsigned long reclaimed) + enum vmpressure_levels level) { struct vmpressure_event *ev; - enum vmpressure_levels level; bool signalled = false; - level = vmpressure_calc_level(scanned, reclaimed); - mutex_lock(&vmpr->events_lock); list_for_each_entry(ev, &vmpr->events, node) { @@ -164,6 +161,7 @@ static void vmpressure_work_fn(struct work_struct *work) struct vmpressure *vmpr = work_to_vmpressure(work); unsigned long scanned; unsigned long reclaimed; + enum vmpressure_levels level; spin_lock(&vmpr->sr_lock); /* @@ -174,19 +172,21 @@ static void vmpressure_work_fn(struct work_struct *work) * here. No need for any locks here since we don't care if * vmpr->reclaimed is in sync. */ - scanned = vmpr->scanned; + scanned = vmpr->tree_scanned; if (!scanned) { spin_unlock(&vmpr->sr_lock); return; } - reclaimed = vmpr->reclaimed; - vmpr->scanned = 0; - vmpr->reclaimed = 0; + reclaimed = vmpr->tree_reclaimed; + vmpr->tree_scanned = 0; + vmpr->tree_reclaimed = 0; spin_unlock(&vmpr->sr_lock); + level = vmpressure_calc_level(scanned, reclaimed); + do { - if (vmpressure_event(vmpr, scanned, reclaimed)) + if (vmpressure_event(vmpr, level)) break; /* * If not handled, propagate the event upward into the @@ -199,6 +199,7 @@ static void vmpressure_work_fn(struct work_struct *work) * vmpressure() - Account memory pressure through scanned/reclaimed ratio * @gfp: reclaimer's gfp mask * @memcg: cgroup memory controller handle + * @tree: legacy subtree mode * @scanned: number of pages scanned * @reclaimed: number of pages reclaimed * @@ -206,9 +207,16 @@ static void vmpressure_work_fn(struct work_struct *work) * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw * pressure index is then further refined and averaged over time. * + * If @tree is set, vmpressure is in traditional userspace reporting + * mode: @memcg is considered the pressure root and userspace is + * notified of the entire subtree's reclaim efficiency. + * + * If @tree is not set, reclaim efficiency is recorded for @memcg, and + * only in-kernel users are notified. + * * This function does not return any value. */ -void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, +void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, unsigned long scanned, unsigned long reclaimed) { struct vmpressure *vmpr = memcg_to_vmpressure(memcg); @@ -238,15 +246,47 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, if (!scanned) return; - spin_lock(&vmpr->sr_lock); - vmpr->scanned += scanned; - vmpr->reclaimed += reclaimed; - scanned = vmpr->scanned; - spin_unlock(&vmpr->sr_lock); + if (tree) { + spin_lock(&vmpr->sr_lock); + vmpr->tree_scanned += scanned; + vmpr->tree_reclaimed += reclaimed; + scanned = vmpr->scanned; + spin_unlock(&vmpr->sr_lock); - if (scanned < vmpressure_win) - return; - schedule_work(&vmpr->work); + if (scanned < vmpressure_win) + return; + schedule_work(&vmpr->work); + } else { + enum vmpressure_levels level; + + /* For now, no users for root-level efficiency */ + if (memcg == root_mem_cgroup) + return; + + spin_lock(&vmpr->sr_lock); + scanned = vmpr->scanned += scanned; + reclaimed = vmpr->reclaimed += reclaimed; + if (scanned < vmpressure_win) { + spin_unlock(&vmpr->sr_lock); + return; + } + vmpr->scanned = vmpr->reclaimed = 0; + spin_unlock(&vmpr->sr_lock); + + level = vmpressure_calc_level(scanned, reclaimed); + + if (level > VMPRESSURE_LOW) { + /* + * Let the socket buffer allocator know that + * we are having trouble reclaiming LRU pages. + * + * For hysteresis keep the pressure state + * asserted for a second in which subsequent + * pressure events can occur. + */ + memcg->socket_pressure = jiffies + HZ; + } + } } /** @@ -276,7 +316,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) * to the vmpressure() basically means that we signal 'critical' * level. */ - vmpressure(gfp, memcg, vmpressure_win, 0); + vmpressure(gfp, memcg, true, vmpressure_win, 0); } /** diff --git a/mm/vmscan.c b/mm/vmscan.c index e36d766dade9..bb0cbd4c9f01 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2396,6 +2396,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, memcg = mem_cgroup_iter(root, NULL, &reclaim); do { unsigned long lru_pages; + unsigned long reclaimed; unsigned long scanned; struct lruvec *lruvec; int swappiness; @@ -2408,6 +2409,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, lruvec = mem_cgroup_zone_lruvec(zone, memcg); swappiness = mem_cgroup_swappiness(memcg); + reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; shrink_lruvec(lruvec, swappiness, sc, &lru_pages); @@ -2418,6 +2420,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, memcg, sc->nr_scanned - scanned, lru_pages); + /* Record the group's reclaim efficiency */ + vmpressure(sc->gfp_mask, memcg, false, + sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); + /* * Direct reclaim and kswapd have to scan all memory * cgroups to fulfill the overall scan target for the @@ -2449,7 +2456,8 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, reclaim_state->reclaimed_slab = 0; } - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, + /* Record the subtree's reclaim efficiency */ + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, sc->nr_scanned - nr_scanned, sc->nr_reclaimed - nr_reclaimed); -- cgit v1.2.3 From ef12947c9c5a96af549c49f10e5503f0612a397c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:34 -0800 Subject: mm: memcontrol: switch to the updated jump-label API According to the direct use of struct static_key is deprecated. Update the socket and slab accounting code accordingly. Signed-off-by: Johannes Weiner Acked-by: David S. Miller Reported-by: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 8 ++++---- mm/memcontrol.c | 12 ++++++------ net/ipv4/tcp_memcontrol.c | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c5a51039df57..2292468f2a30 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -704,8 +704,8 @@ void sock_release_memcg(struct sock *sk); bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); #if defined(CONFIG_MEMCG) && defined(CONFIG_INET) -extern struct static_key memcg_sockets_enabled_key; -#define mem_cgroup_sockets_enabled static_key_false(&memcg_sockets_enabled_key) +extern struct static_key_false memcg_sockets_enabled_key; +#define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { #ifdef CONFIG_MEMCG_KMEM @@ -727,7 +727,7 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) #endif #ifdef CONFIG_MEMCG_KMEM -extern struct static_key memcg_kmem_enabled_key; +extern struct static_key_false memcg_kmem_enabled_key; extern int memcg_nr_cache_ids; void memcg_get_cache_ids(void); @@ -743,7 +743,7 @@ void memcg_put_cache_ids(void); static inline bool memcg_kmem_enabled(void) { - return static_key_false(&memcg_kmem_enabled_key); + return static_branch_unlikely(&memcg_kmem_enabled_key); } static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index df7f144a5a4b..54eae4f19d80 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -346,7 +346,7 @@ void memcg_put_cache_ids(void) * conditional to this static branch, we'll have to allow modules that does * kmem_cache_alloc and the such to see this symbol as well */ -struct static_key memcg_kmem_enabled_key; +DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); #endif /* CONFIG_MEMCG_KMEM */ @@ -2907,7 +2907,7 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, err = page_counter_limit(&memcg->kmem, nr_pages); VM_BUG_ON(err); - static_key_slow_inc(&memcg_kmem_enabled_key); + static_branch_inc(&memcg_kmem_enabled_key); /* * A memory cgroup is considered kmem-active as soon as it gets * kmemcg_id. Setting the id after enabling static branching will @@ -3646,7 +3646,7 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg) { if (memcg->kmem_acct_activated) { memcg_destroy_kmem_caches(memcg); - static_key_slow_dec(&memcg_kmem_enabled_key); + static_branch_dec(&memcg_kmem_enabled_key); WARN_ON(page_counter_read(&memcg->kmem)); } tcp_destroy_cgroup(memcg); @@ -4282,7 +4282,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) #ifdef CONFIG_INET if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) - static_key_slow_inc(&memcg_sockets_enabled_key); + static_branch_inc(&memcg_sockets_enabled_key); #endif /* @@ -4333,7 +4333,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) memcg_destroy_kmem(memcg); #ifdef CONFIG_INET if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) - static_key_slow_dec(&memcg_sockets_enabled_key); + static_branch_dec(&memcg_sockets_enabled_key); #endif __mem_cgroup_free(memcg); } @@ -5557,7 +5557,7 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) #ifdef CONFIG_INET -struct static_key memcg_sockets_enabled_key; +DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); EXPORT_SYMBOL(memcg_sockets_enabled_key); void sock_update_memcg(struct sock *sk) diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 9a22e2dfd64a..18bc7f745e9c 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -34,7 +34,7 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg) return; if (memcg->tcp_mem.active) - static_key_slow_dec(&memcg_sockets_enabled_key); + static_branch_dec(&memcg_sockets_enabled_key); } static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) @@ -65,7 +65,7 @@ static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) * because when this value change, the code to process it is not * patched in yet. */ - static_key_slow_inc(&memcg_sockets_enabled_key); + static_branch_inc(&memcg_sockets_enabled_key); memcg->tcp_mem.active = true; } -- cgit v1.2.3