From 5d097056c9a017a3b720849efb5432f37acabbac Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 14 Jan 2016 15:18:21 -0800 Subject: kmemcg: account certain kmem allocations to memcg Mark those kmem allocations that are known to be easily triggered from userspace as __GFP_ACCOUNT/SLAB_ACCOUNT, which makes them accounted to memcg. For the list, see below: - threadinfo - task_struct - task_delay_info - pid - cred - mm_struct - vm_area_struct and vm_region (nommu) - anon_vma and anon_vma_chain - signal_struct - sighand_struct - fs_struct - files_struct - fdtable and fdtable->full_fds_bits - dentry and external_name - inode for all filesystems. This is the most tedious part, because most filesystems overwrite the alloc_inode method. The list is far from complete, so feel free to add more objects. Nevertheless, it should be close to "account everything" approach and keep most workloads within bounds. Malevolent users will be able to breach the limit, but this was possible even with the former "account everything" approach (simply because it did not account everything in fact). [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Greg Thelen Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/shmem.c') diff --git a/mm/shmem.c b/mm/shmem.c index 5813b7fa85b6..9e60093aca3f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3064,7 +3064,7 @@ static int shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), - 0, SLAB_PANIC, shmem_init_inode); + 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); return 0; } -- cgit v1.2.3 From 6a15a37097c7e02390bb08d83dac433c9f10144f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 14 Jan 2016 15:19:20 -0800 Subject: mm, proc: reduce cost of /proc/pid/smaps for shmem mappings The previous patch has improved swap accounting for shmem mapping, which however made /proc/pid/smaps more expensive for shmem mappings, as we consult the radix tree for each pte_none entry, so the overal complexity is O(n*log(n)). We can reduce this significantly for mappings that cannot contain COWed pages, because then we can either use the statistics tha shmem object itself tracks (if the mapping contains the whole object, or the swap usage of the whole object is zero), or use the radix tree iterator, which is much more effective than repeated find_get_entry() calls. This patch therefore introduces a function shmem_swap_usage(vma) and makes /proc/pid/smaps use it when possible. Only for writable private mappings of shmem objects (i.e. tmpfs files) with the shmem object itself (partially) swapped outwe have to resort to the find_get_entry() approach. Hopefully such mappings are relatively uncommon. To demonstrate the diference, I have measured this on a process that creates a 2GB mapping and dirties single pages with a stride of 2MB, and time how long does it take to cat /proc/pid/smaps of this process 100 times. Private writable mapping of a /dev/shm/file (the most complex case): real 0m3.831s user 0m0.180s sys 0m3.212s Shared mapping of an almost full mapping of a partially swapped /dev/shm/file (which needs to employ the radix tree iterator). real 0m1.351s user 0m0.096s sys 0m0.768s Same, but with /dev/shm/file not swapped (so no radix tree walk needed) real 0m0.935s user 0m0.128s sys 0m0.344s Private anonymous mapping: real 0m0.949s user 0m0.116s sys 0m0.348s The cost is now much closer to the private anonymous mapping case, unless the shmem mapping is private and writable. Signed-off-by: Vlastimil Babka Cc: Hugh Dickins Cc: Jerome Marchand Cc: Konstantin Khlebnikov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 22 +++++++++++++-- include/linux/shmem_fs.h | 2 ++ mm/shmem.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 2 deletions(-) (limited to 'mm/shmem.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 85ef60fdf2c0..5830b2e129ed 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -717,8 +718,25 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) #ifdef CONFIG_SHMEM if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { - mss.check_shmem_swap = true; - smaps_walk.pte_hole = smaps_pte_hole; + /* + * For shared or readonly shmem mappings we know that all + * swapped out pages belong to the shmem object, and we can + * obtain the swap value much more efficiently. For private + * writable mappings, we might have COW pages that are + * not affected by the parent swapped out pages of the shmem + * object, so we have to distinguish them during the page walk. + * Unless we know that the shmem object (or the part mapped by + * our VMA) has no swapped out pages at all. + */ + unsigned long shmem_swapped = shmem_swap_usage(vma); + + if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || + !(vma->vm_flags & VM_WRITE)) { + mss.swap = shmem_swapped; + } else { + mss.check_shmem_swap = true; + smaps_walk.pte_hole = smaps_pte_hole; + } } #endif diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 50777b5b1e4c..bd58be5e7a2a 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -60,6 +60,8 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); extern int shmem_unuse(swp_entry_t entry, struct page *page); +extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); + static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) { diff --git a/mm/shmem.c b/mm/shmem.c index 9e60093aca3f..e978621de1ef 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -359,6 +359,76 @@ static int shmem_free_swap(struct address_space *mapping, return 0; } +/* + * Determine (in bytes) how many of the shmem object's pages mapped by the + * given vma is swapped out. + * + * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, + * as long as the inode doesn't go away and racy results are not a problem. + */ +unsigned long shmem_swap_usage(struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct shmem_inode_info *info = SHMEM_I(inode); + struct address_space *mapping = inode->i_mapping; + unsigned long swapped; + pgoff_t start, end; + struct radix_tree_iter iter; + void **slot; + struct page *page; + + /* Be careful as we don't hold info->lock */ + swapped = READ_ONCE(info->swapped); + + /* + * The easier cases are when the shmem object has nothing in swap, or + * the vma maps it whole. Then we can simply use the stats that we + * already track. + */ + if (!swapped) + return 0; + + if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size) + return swapped << PAGE_SHIFT; + + swapped = 0; + + /* Here comes the more involved part */ + start = linear_page_index(vma, vma->vm_start); + end = linear_page_index(vma, vma->vm_end); + + rcu_read_lock(); + +restart: + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + if (iter.index >= end) + break; + + page = radix_tree_deref_slot(slot); + + /* + * This should only be possible to happen at index 0, so we + * don't need to reset the counter, nor do we risk infinite + * restarts. + */ + if (radix_tree_deref_retry(page)) + goto restart; + + if (radix_tree_exceptional_entry(page)) + swapped++; + + if (need_resched()) { + cond_resched_rcu(); + start = iter.index + 1; + goto restart; + } + } + + rcu_read_unlock(); + + return swapped << PAGE_SHIFT; +} + /* * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. */ -- cgit v1.2.3 From 48131e03ca4ed71d73fbe55c311a258c6fa2a090 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 14 Jan 2016 15:19:23 -0800 Subject: mm, proc: reduce cost of /proc/pid/smaps for unpopulated shmem mappings Following the previous patch, further reduction of /proc/pid/smaps cost is possible for private writable shmem mappings with unpopulated areas where the page walk invokes the .pte_hole function. We can use radix tree iterator for each such area instead of calling find_get_entry() in a loop. This is possible at the extra maintenance cost of introducing another shmem function shmem_partial_swap_usage(). To demonstrate the diference, I have measured this on a process that creates a private writable 2GB mapping of a partially swapped out /dev/shm/file (which cannot employ the optimizations from the prvious patch) and doesn't populate it at all. I time how long does it take to cat /proc/pid/smaps of this process 100 times. Before this patch: real 0m3.831s user 0m0.180s sys 0m3.212s After this patch: real 0m1.176s user 0m0.180s sys 0m0.684s The time is similar to the case where a radix tree iterator is employed on the whole mapping. Signed-off-by: Vlastimil Babka Cc: Hugh Dickins Cc: Jerome Marchand Cc: Konstantin Khlebnikov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 42 ++++++++++--------------------- include/linux/shmem_fs.h | 2 ++ mm/shmem.c | 65 ++++++++++++++++++++++++++++-------------------- 3 files changed, 53 insertions(+), 56 deletions(-) (limited to 'mm/shmem.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5830b2e129ed..8a03759bda38 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -488,42 +488,16 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, } #ifdef CONFIG_SHMEM -static unsigned long smaps_shmem_swap(struct vm_area_struct *vma, - unsigned long addr) -{ - struct page *page; - - page = find_get_entry(vma->vm_file->f_mapping, - linear_page_index(vma, addr)); - if (!page) - return 0; - - if (radix_tree_exceptional_entry(page)) - return PAGE_SIZE; - - page_cache_release(page); - return 0; - -} - static int smaps_pte_hole(unsigned long addr, unsigned long end, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; - while (addr < end) { - mss->swap += smaps_shmem_swap(walk->vma, addr); - addr += PAGE_SIZE; - } + mss->swap += shmem_partial_swap_usage( + walk->vma->vm_file->f_mapping, addr, end); return 0; } -#else -static unsigned long smaps_shmem_swap(struct vm_area_struct *vma, - unsigned long addr) -{ - return 0; -} #endif static void smaps_pte_entry(pte_t *pte, unsigned long addr, @@ -555,7 +529,17 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, page = migration_entry_to_page(swpent); } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap && pte_none(*pte))) { - mss->swap += smaps_shmem_swap(vma, addr); + page = find_get_entry(vma->vm_file->f_mapping, + linear_page_index(vma, addr)); + if (!page) + return; + + if (radix_tree_exceptional_entry(page)) + mss->swap += PAGE_SIZE; + else + page_cache_release(page); + + return; } if (!page) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index bd58be5e7a2a..a43f41cb3c43 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -61,6 +61,8 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); extern int shmem_unuse(swp_entry_t entry, struct page *page); extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); +extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, + pgoff_t start, pgoff_t end); static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) diff --git a/mm/shmem.c b/mm/shmem.c index e978621de1ef..760d90cf2a41 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -361,41 +361,18 @@ static int shmem_free_swap(struct address_space *mapping, /* * Determine (in bytes) how many of the shmem object's pages mapped by the - * given vma is swapped out. + * given offsets are swapped out. * * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ -unsigned long shmem_swap_usage(struct vm_area_struct *vma) +unsigned long shmem_partial_swap_usage(struct address_space *mapping, + pgoff_t start, pgoff_t end) { - struct inode *inode = file_inode(vma->vm_file); - struct shmem_inode_info *info = SHMEM_I(inode); - struct address_space *mapping = inode->i_mapping; - unsigned long swapped; - pgoff_t start, end; struct radix_tree_iter iter; void **slot; struct page *page; - - /* Be careful as we don't hold info->lock */ - swapped = READ_ONCE(info->swapped); - - /* - * The easier cases are when the shmem object has nothing in swap, or - * the vma maps it whole. Then we can simply use the stats that we - * already track. - */ - if (!swapped) - return 0; - - if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size) - return swapped << PAGE_SHIFT; - - swapped = 0; - - /* Here comes the more involved part */ - start = linear_page_index(vma, vma->vm_start); - end = linear_page_index(vma, vma->vm_end); + unsigned long swapped = 0; rcu_read_lock(); @@ -429,6 +406,40 @@ restart: return swapped << PAGE_SHIFT; } +/* + * Determine (in bytes) how many of the shmem object's pages mapped by the + * given vma is swapped out. + * + * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, + * as long as the inode doesn't go away and racy results are not a problem. + */ +unsigned long shmem_swap_usage(struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct shmem_inode_info *info = SHMEM_I(inode); + struct address_space *mapping = inode->i_mapping; + unsigned long swapped; + + /* Be careful as we don't hold info->lock */ + swapped = READ_ONCE(info->swapped); + + /* + * The easier cases are when the shmem object has nothing in swap, or + * the vma maps it whole. Then we can simply use the stats that we + * already track. + */ + if (!swapped) + return 0; + + if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size) + return swapped << PAGE_SHIFT; + + /* Here comes the more involved part */ + return shmem_partial_swap_usage(mapping, + linear_page_index(vma, vma->vm_start), + linear_page_index(vma, vma->vm_end)); +} + /* * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. */ -- cgit v1.2.3