From f62e00cc3a00bfbd394a79fc22b334c31f91bd5f Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 24 May 2011 17:11:29 -0700 Subject: mm: introduce wait_on_page_locked_killable() commit 2687a356 ("Add lock_page_killable") introduced killable lock_page(). Similarly this patch introdues killable wait_on_page_locked(). Signed-off-by: KOSAKI Motohiro Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Matthew Wilcox Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index c641edf553a9..dea8a38bb2bb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -562,6 +562,17 @@ void wait_on_page_bit(struct page *page, int bit_nr) } EXPORT_SYMBOL(wait_on_page_bit); +int wait_on_page_bit_killable(struct page *page, int bit_nr) +{ + DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); + + if (!test_bit(bit_nr, &page->flags)) + return 0; + + return __wait_on_bit(page_waitqueue(page), &wait, + sleep_on_page_killable, TASK_KILLABLE); +} + /** * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue * @page: Page defining the wait queue of interest -- cgit v1.2.3 From 37b23e0525d393d48a7d59f870b3bc061a30ccdb Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 24 May 2011 17:11:30 -0700 Subject: x86,mm: make pagefault killable When an oom killing occurs, almost all processes are getting stuck at the following two points. 1) __alloc_pages_nodemask 2) __lock_page_or_retry 1) is not very problematic because TIF_MEMDIE leads to an allocation failure and getting out from page allocator. 2) is more problematic. In an OOM situation, zones typically don't have page cache at all and memory starvation might lead to greatly reduced IO performance. When a fork bomb occurs, TIF_MEMDIE tasks don't die quickly, meaning that a fork bomb may create new process quickly rather than the oom-killer killing it. Then, the system may become livelocked. This patch makes the pagefault interruptible by SIGKILL. Signed-off-by: KOSAKI Motohiro Reviewed-by: KAMEZAWA Hiroyuki Cc: Minchan Kim Cc: Matthew Wilcox Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/fault.c | 12 +++++++++++- include/linux/mm.h | 1 + mm/filemap.c | 31 ++++++++++++++++++++++++------- 3 files changed, 36 insertions(+), 8 deletions(-) (limited to 'mm/filemap.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index bcb394dfbb35..f7a2a054a3c0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -965,7 +965,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) struct mm_struct *mm; int fault; int write = error_code & PF_WRITE; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | (write ? FAULT_FLAG_WRITE : 0); tsk = current; @@ -1138,6 +1138,16 @@ good_area: return; } + /* + * Pagefault was interrupted by SIGKILL. We have no reason to + * continue pagefault. + */ + if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { + if (!(error_code & PF_USER)) + no_context(regs, error_code, address); + return; + } + /* * Major/minor page fault accounting is only done on the * initial attempt. If we go through a retry, it is extremely diff --git a/include/linux/mm.h b/include/linux/mm.h index 1746f67c33de..57d3d5fade16 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -153,6 +153,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */ #define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */ #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ +#define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ /* * This interface is used by x86 PAT code to identify a pfn mapping that is diff --git a/mm/filemap.c b/mm/filemap.c index dea8a38bb2bb..8144f87dcbb4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -654,15 +654,32 @@ EXPORT_SYMBOL_GPL(__lock_page_killable); int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { - if (!(flags & FAULT_FLAG_ALLOW_RETRY)) { - __lock_page(page); - return 1; - } else { - if (!(flags & FAULT_FLAG_RETRY_NOWAIT)) { - up_read(&mm->mmap_sem); + if (flags & FAULT_FLAG_ALLOW_RETRY) { + /* + * CAUTION! In this case, mmap_sem is not released + * even though return 0. + */ + if (flags & FAULT_FLAG_RETRY_NOWAIT) + return 0; + + up_read(&mm->mmap_sem); + if (flags & FAULT_FLAG_KILLABLE) + wait_on_page_locked_killable(page); + else wait_on_page_locked(page); - } return 0; + } else { + if (flags & FAULT_FLAG_KILLABLE) { + int ret; + + ret = __lock_page_killable(page); + if (ret) { + up_read(&mm->mmap_sem); + return 0; + } + } else + __lock_page(page); + return 1; } } -- cgit v1.2.3 From 3d48ae45e72390ddf8cc5256ac32ed6f7a19cbea Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 May 2011 17:12:06 -0700 Subject: mm: Convert i_mmap_lock to a mutex Straightforward conversion of i_mmap_lock to a mutex. Signed-off-by: Peter Zijlstra Acked-by: Hugh Dickins Cc: Benjamin Herrenschmidt Cc: David Miller Cc: Martin Schwidefsky Cc: Russell King Cc: Paul Mundt Cc: Jeff Dike Cc: Richard Weinberger Cc: Tony Luck Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/lockstat.txt | 2 +- Documentation/vm/locking | 2 +- arch/x86/mm/hugetlbpage.c | 4 ++-- fs/hugetlbfs/inode.c | 4 ++-- fs/inode.c | 2 +- include/linux/fs.h | 2 +- include/linux/mmu_notifier.h | 2 +- kernel/fork.c | 4 ++-- mm/filemap.c | 10 +++++----- mm/filemap_xip.c | 4 ++-- mm/fremap.c | 4 ++-- mm/hugetlb.c | 14 +++++++------- mm/memory-failure.c | 4 ++-- mm/memory.c | 4 ++-- mm/mmap.c | 22 +++++++++++----------- mm/mremap.c | 4 ++-- mm/rmap.c | 28 ++++++++++++++-------------- 17 files changed, 58 insertions(+), 58 deletions(-) (limited to 'mm/filemap.c') diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt index 65f4c795015d..9c0a80d17a23 100644 --- a/Documentation/lockstat.txt +++ b/Documentation/lockstat.txt @@ -136,7 +136,7 @@ View the top contending locks: dcache_lock: 1037 1161 0.38 45.32 774.51 6611 243371 0.15 306.48 77387.24 &inode->i_mutex: 161 286 18446744073709 62882.54 1244614.55 3653 20598 18446744073709 62318.60 1693822.74 &zone->lru_lock: 94 94 0.53 7.33 92.10 4366 32690 0.29 59.81 16350.06 - &inode->i_data.i_mmap_lock: 79 79 0.40 3.77 53.03 11779 87755 0.28 116.93 29898.44 + &inode->i_data.i_mmap_mutex: 79 79 0.40 3.77 53.03 11779 87755 0.28 116.93 29898.44 &q->__queue_lock: 48 50 0.52 31.62 86.31 774 13131 0.17 113.08 12277.52 &rq->rq_lock_key: 43 47 0.74 68.50 170.63 3706 33929 0.22 107.99 17460.62 &rq->rq_lock_key#2: 39 46 0.75 6.68 49.03 2979 32292 0.17 125.17 17137.63 diff --git a/Documentation/vm/locking b/Documentation/vm/locking index 25fadb448760..f61228bd6395 100644 --- a/Documentation/vm/locking +++ b/Documentation/vm/locking @@ -66,7 +66,7 @@ in some cases it is not really needed. Eg, vm_start is modified by expand_stack(), it is hard to come up with a destructive scenario without having the vmlist protection in this case. -The page_table_lock nests with the inode i_mmap_lock and the kmem cache +The page_table_lock nests with the inode i_mmap_mutex and the kmem cache c_spinlock spinlocks. This is okay, since the kmem code asks for pages after dropping c_spinlock. The page_table_lock also nests with pagecache_lock and pagemap_lru_lock spinlocks, and no code asks for memory with these locks diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index d4203988504a..f581a18c0d4d 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -72,7 +72,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!vma_shareable(vma, addr)) return; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -97,7 +97,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) put_page(virt_to_page(spte)); spin_unlock(&mm->page_table_lock); out: - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); } /* diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b9eeb1cd03ff..e7a035781b7d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -412,10 +412,10 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) pgoff = offset >> PAGE_SHIFT; i_size_write(inode, offset); - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); if (!prio_tree_empty(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); truncate_hugepages(inode, offset); return 0; } diff --git a/fs/inode.c b/fs/inode.c index 7a7284c71abd..88aa3bcc7681 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -326,7 +326,7 @@ void address_space_init_once(struct address_space *mapping) memset(mapping, 0, sizeof(*mapping)); INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); spin_lock_init(&mapping->tree_lock); - spin_lock_init(&mapping->i_mmap_lock); + mutex_init(&mapping->i_mmap_mutex); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); diff --git a/include/linux/fs.h b/include/linux/fs.h index 5d2c86bdf5ba..5bb9e826019b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -634,7 +634,7 @@ struct address_space { unsigned int i_mmap_writable;/* count VM_SHARED mappings */ struct prio_tree_root i_mmap; /* tree of private and shared mappings */ struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ - spinlock_t i_mmap_lock; /* protect tree, count, list */ + struct mutex i_mmap_mutex; /* protect tree, count, list */ unsigned long nrpages; /* number of total pages */ pgoff_t writeback_index;/* writeback starts here */ const struct address_space_operations *a_ops; /* methods */ diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index cc2e7dfea9d7..a877dfc243eb 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -150,7 +150,7 @@ struct mmu_notifier_ops { * Therefore notifier chains can only be traversed when either * * 1. mmap_sem is held. - * 2. One of the reverse map locks is held (i_mmap_lock or anon_vma->lock). + * 2. One of the reverse map locks is held (i_mmap_mutex or anon_vma->lock). * 3. No other concurrent thread can access the list (release) */ struct mmu_notifier { diff --git a/kernel/fork.c b/kernel/fork.c index 4eef925477fc..927692734bcf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -383,14 +383,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); if (tmp->vm_flags & VM_SHARED) mapping->i_mmap_writable++; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_prio_tree_add(tmp, mpnt); flush_dcache_mmap_unlock(mapping); - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); } /* diff --git a/mm/filemap.c b/mm/filemap.c index 8144f87dcbb4..88354ae0b1fd 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -58,16 +58,16 @@ /* * Lock ordering: * - * ->i_mmap_lock (truncate_pagecache) + * ->i_mmap_mutex (truncate_pagecache) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock * * ->i_mutex - * ->i_mmap_lock (truncate->unmap_mapping_range) + * ->i_mmap_mutex (truncate->unmap_mapping_range) * * ->mmap_sem - * ->i_mmap_lock + * ->i_mmap_mutex * ->page_table_lock or pte_lock (various, mainly in memory.c) * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) * @@ -84,7 +84,7 @@ * sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) * - * ->i_mmap_lock + * ->i_mmap_mutex * ->anon_vma.lock (vma_adjust) * * ->anon_vma.lock @@ -106,7 +106,7 @@ * * (code doesn't rely on that order, so you could switch it around) * ->tasklist_lock (memory_failure, collect_procs_ao) - * ->i_mmap_lock + * ->i_mmap_mutex */ /* diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 83364df74a33..93356cd12828 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -183,7 +183,7 @@ __xip_unmap (struct address_space * mapping, return; retry: - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { mm = vma->vm_mm; address = vma->vm_start + @@ -201,7 +201,7 @@ retry: page_cache_release(page); } } - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); if (locked) { mutex_unlock(&xip_sparse_mutex); diff --git a/mm/fremap.c b/mm/fremap.c index ec520c7b28df..7f4123056e06 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -211,13 +211,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, } goto out; } - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); vma->vm_flags |= VM_NONLINEAR; vma_prio_tree_remove(vma, &mapping->i_mmap); vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); flush_dcache_mmap_unlock(mapping); - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); } if (vma->vm_flags & VM_LOCKED) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bbb4a5bbb958..5fd68b95c671 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2205,7 +2205,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long sz = huge_page_size(h); /* - * A page gathering list, protected by per file i_mmap_lock. The + * A page gathering list, protected by per file i_mmap_mutex. The * lock is used to avoid list corruption from multiple unmapping * of the same page since we are using page->lru. */ @@ -2274,9 +2274,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { - spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); + mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); __unmap_hugepage_range(vma, start, end, ref_page); - spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); + mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); } /* @@ -2308,7 +2308,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * this mapping should be shared between all the VMAs, * __unmap_hugepage_range() is called as the lock is already held */ - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { /* Do not unmap the current VMA */ if (iter_vma == vma) @@ -2326,7 +2326,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, address, address + huge_page_size(h), page); } - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return 1; } @@ -2810,7 +2810,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, BUG_ON(address >= end); flush_cache_range(vma, address, end); - spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); + mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); spin_lock(&mm->page_table_lock); for (; address < end; address += huge_page_size(h)) { ptep = huge_pte_offset(mm, address); @@ -2825,7 +2825,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, } } spin_unlock(&mm->page_table_lock); - spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); + mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); flush_tlb_range(vma, start, end); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2b9a5eef39e0..12178ec32ab5 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -429,7 +429,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, */ read_lock(&tasklist_lock); - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); for_each_process(tsk) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -449,7 +449,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, add_to_kill(tsk, page, vma, to_kill, tkc); } } - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); read_unlock(&tasklist_lock); } diff --git a/mm/memory.c b/mm/memory.c index 18655878b9f8..7bbe4d3df756 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2667,12 +2667,12 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = ULONG_MAX; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); if (unlikely(!prio_tree_empty(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); } EXPORT_SYMBOL(unmap_mapping_range); diff --git a/mm/mmap.c b/mm/mmap.c index 50cb04bb56bf..26efbfca0b20 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -194,7 +194,7 @@ error: } /* - * Requires inode->i_mapping->i_mmap_lock + * Requires inode->i_mapping->i_mmap_mutex */ static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct file *file, struct address_space *mapping) @@ -222,9 +222,9 @@ void unlink_file_vma(struct vm_area_struct *vma) if (file) { struct address_space *mapping = file->f_mapping; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); __remove_shared_vm_struct(vma, file, mapping); - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); } } @@ -446,13 +446,13 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, mapping = vma->vm_file->f_mapping; if (mapping) - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); __vma_link(mm, vma, prev, rb_link, rb_parent); __vma_link_file(vma); if (mapping) - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); mm->map_count++; validate_mm(mm); @@ -555,7 +555,7 @@ again: remove_next = 1 + (end > next->vm_end); mapping = file->f_mapping; if (!(vma->vm_flags & VM_NONLINEAR)) root = &mapping->i_mmap; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); if (insert) { /* * Put into prio_tree now, so instantiated pages @@ -622,7 +622,7 @@ again: remove_next = 1 + (end > next->vm_end); if (anon_vma) anon_vma_unlock(anon_vma); if (mapping) - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); if (remove_next) { if (file) { @@ -2290,7 +2290,7 @@ void exit_mmap(struct mm_struct *mm) /* Insert vm structure into process list sorted by address * and into the inode's i_mmap tree. If vm_file is non-NULL - * then i_mmap_lock is taken here. + * then i_mmap_mutex is taken here. */ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) { @@ -2532,7 +2532,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) */ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); - spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem); + mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem); } } @@ -2559,7 +2559,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * vma in this mm is backed by the same anon_vma or address_space. * * We can take all the locks in random order because the VM code - * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never + * taking i_mmap_mutex or anon_vma->lock outside the mmap_sem never * takes more than one of them in a row. Secondly we're protected * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. * @@ -2631,7 +2631,7 @@ static void vm_unlock_mapping(struct address_space *mapping) * AS_MM_ALL_LOCKS can't change to 0 from under us * because we hold the mm_all_locks_mutex. */ - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); if (!test_and_clear_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); diff --git a/mm/mremap.c b/mm/mremap.c index 909e1e1e99b1..506fa44403df 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -93,7 +93,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, * and we propagate stale pages into the dst afterward. */ mapping = vma->vm_file->f_mapping; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); } /* @@ -122,7 +122,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); } diff --git a/mm/rmap.c b/mm/rmap.c index 522e4a93cadd..f0ef7ea5423a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -24,7 +24,7 @@ * inode->i_alloc_sem (vmtruncate_range) * mm->mmap_sem * page->flags PG_locked (lock_page) - * mapping->i_mmap_lock + * mapping->i_mmap_mutex * anon_vma->lock * mm->page_table_lock or pte_lock * zone->lru_lock (in mark_page_accessed, isolate_lru_page) @@ -646,14 +646,14 @@ static int page_referenced_file(struct page *page, * The page lock not only makes sure that page->mapping cannot * suddenly be NULLified by truncation, it makes sure that the * structure at mapping cannot be freed and reused yet, - * so we can safely take mapping->i_mmap_lock. + * so we can safely take mapping->i_mmap_mutex. */ BUG_ON(!PageLocked(page)); - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); /* - * i_mmap_lock does not stabilize mapcount at all, but mapcount + * i_mmap_mutex does not stabilize mapcount at all, but mapcount * is more likely to be accurate if we note it after spinning. */ mapcount = page_mapcount(page); @@ -675,7 +675,7 @@ static int page_referenced_file(struct page *page, break; } - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return referenced; } @@ -762,7 +762,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) BUG_ON(PageAnon(page)); - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { if (vma->vm_flags & VM_SHARED) { unsigned long address = vma_address(page, vma); @@ -771,7 +771,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) ret += page_mkclean_one(page, vma, address); } } - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return ret; } @@ -1119,7 +1119,7 @@ out_mlock: /* * We need mmap_sem locking, Otherwise VM_LOCKED check makes * unstable result and race. Plus, We can't wait here because - * we now hold anon_vma->lock or mapping->i_mmap_lock. + * we now hold anon_vma->lock or mapping->i_mmap_mutex. * if trylock failed, the page remain in evictable lru and later * vmscan could retry to move the page to unevictable lru if the * page is actually mlocked. @@ -1345,7 +1345,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) unsigned long max_nl_size = 0; unsigned int mapcount; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) @@ -1391,7 +1391,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) mapcount = page_mapcount(page); if (!mapcount) goto out; - cond_resched_lock(&mapping->i_mmap_lock); + cond_resched(); max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; if (max_nl_cursor == 0) @@ -1413,7 +1413,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) } vma->vm_private_data = (void *) max_nl_cursor; } - cond_resched_lock(&mapping->i_mmap_lock); + cond_resched(); max_nl_cursor += CLUSTER_SIZE; } while (max_nl_cursor <= max_nl_size); @@ -1425,7 +1425,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) vma->vm_private_data = NULL; out: - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return ret; } @@ -1544,7 +1544,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, if (!mapping) return ret; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) @@ -1558,7 +1558,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, * never contain migration ptes. Decide what to do about this * limitation to linear when we need rmap_walk() on nonlinear. */ - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return ret; } -- cgit v1.2.3 From 275b12bf5486f6f531111fd3d7dbbf01df427cfe Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 24 May 2011 17:12:28 -0700 Subject: readahead: return early when readahead is disabled Reduce readahead overheads by returning early in do_sync_mmap_readahead(). tmpfs has ra_pages=0 and it can page fault really fast (not constraint by IO if not swapping). Signed-off-by: Wu Fengguang Tested-by: Tim Chen Reported-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 88354ae0b1fd..c974a2863897 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1556,6 +1556,8 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, /* If we don't want any read-ahead, don't bother */ if (VM_RandomReadHint(vma)) return; + if (!ra->ra_pages) + return; if (VM_SequentialReadHint(vma) || offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) { @@ -1578,12 +1580,10 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, * mmap read-around */ ra_pages = max_sane_readahead(ra->ra_pages); - if (ra_pages) { - ra->start = max_t(long, 0, offset - ra_pages/2); - ra->size = ra_pages; - ra->async_size = 0; - ra_submit(ra, mapping, file); - } + ra->start = max_t(long, 0, offset - ra_pages / 2); + ra->size = ra_pages; + ra->async_size = 0; + ra_submit(ra, mapping, file); } /* -- cgit v1.2.3 From 207d04baa3591a354711e863dd90087fc75873b3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 24 May 2011 17:12:29 -0700 Subject: readahead: reduce unnecessary mmap_miss increases The original INT_MAX is too large, reduce it to - avoid unnecessarily dirtying/bouncing the cache line - restore mmap read-around faster on changed access pattern Background: in the mosbench exim benchmark which does multi-threaded page faults on shared struct file, the ra->mmap_miss updates are found to cause excessive cache line bouncing on tmpfs. The ra state updates are needless for tmpfs because it actually disabled readahead totally (shmem_backing_dev_info.ra_pages == 0). Tested-by: Tim Chen Signed-off-by: Andi Kleen Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index c974a2863897..e5131392d32e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1566,7 +1566,8 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, return; } - if (ra->mmap_miss < INT_MAX) + /* Avoid banging the cache line if not needed */ + if (ra->mmap_miss < MMAP_LOTSAMISS * 10) ra->mmap_miss++; /* -- cgit v1.2.3 From 2cbea1d3ab11946885d37a2461072ee4d687cb4e Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 24 May 2011 17:12:30 -0700 Subject: readahead: trigger mmap sequential readahead on PG_readahead Previously the mmap sequential readahead is triggered by updating ra->prev_pos on each page fault and compare it with current page offset. It costs dirtying the cache line on each _minor_ page fault. So remove the ra->prev_pos recording, and instead tag PG_readahead to trigger the possible sequential readahead. It's not only more simple, but also will work more reliably and reduce cache line bouncing on concurrent page faults on shared struct file. In the mosbench exim benchmark which does multi-threaded page faults on shared struct file, the ra->mmap_miss and ra->prev_pos updates are found to cause excessive cache line bouncing on tmpfs, which actually disabled readahead totally (shmem_backing_dev_info.ra_pages == 0). So remove the ra->prev_pos recording, and instead tag PG_readahead to trigger the possible sequential readahead. It's not only more simple, but also will work more reliably on concurrent reads on shared struct file. Signed-off-by: Wu Fengguang Tested-by: Tim Chen Reported-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index e5131392d32e..68e782b3d3de 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1559,8 +1559,7 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, if (!ra->ra_pages) return; - if (VM_SequentialReadHint(vma) || - offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) { + if (VM_SequentialReadHint(vma)) { page_cache_sync_readahead(mapping, ra, file, offset, ra->ra_pages); return; @@ -1583,7 +1582,7 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, ra_pages = max_sane_readahead(ra->ra_pages); ra->start = max_t(long, 0, offset - ra_pages / 2); ra->size = ra_pages; - ra->async_size = 0; + ra->async_size = ra_pages / 4; ra_submit(ra, mapping, file); } @@ -1689,7 +1688,6 @@ retry_find: return VM_FAULT_SIGBUS; } - ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT; vmf->page = page; return ret | VM_FAULT_LOCKED; -- cgit v1.2.3 From c515e1fd361c2a08a9c2eb139396ec30a4f477dc Mon Sep 17 00:00:00 2001 From: Dan Magenheimer Date: Thu, 26 May 2011 10:01:43 -0600 Subject: mm/fs: add hooks to support cleancache This fourth patch of eight in this cleancache series provides the core hooks in VFS for: initializing cleancache per filesystem; capturing clean pages reclaimed by page cache; attempting to get pages from cleancache before filesystem read; and ensuring coherency between pagecache, disk, and cleancache. Note that the placement of these hooks was stable from 2.6.18 to 2.6.38; a minor semantic change was required due to a patchset in 2.6.39. All hooks become no-ops if CONFIG_CLEANCACHE is unset, or become a check of a boolean global if CONFIG_CLEANCACHE is set but no cleancache "backend" has claimed cleancache_ops. Details and a FAQ can be found in Documentation/vm/cleancache.txt [v8: minchan.kim@gmail.com: adapt to new remove_from_page_cache function] Signed-off-by: Chris Mason Signed-off-by: Dan Magenheimer Reviewed-by: Jeremy Fitzhardinge Reviewed-by: Konrad Rzeszutek Wilk Cc: Andrew Morton Cc: Al Viro Cc: Matthew Wilcox Cc: Nick Piggin Cc: Mel Gorman Cc: Rik Van Riel Cc: Jan Beulich Cc: Andreas Dilger Cc: Ted Ts'o Cc: Mark Fasheh Cc: Joel Becker Cc: Nitin Gupta --- fs/buffer.c | 5 +++++ fs/mpage.c | 7 +++++++ fs/super.c | 3 +++ mm/filemap.c | 11 +++++++++++ mm/truncate.c | 6 ++++++ 5 files changed, 32 insertions(+) (limited to 'mm/filemap.c') diff --git a/fs/buffer.c b/fs/buffer.c index a08bb8e61c6f..de05703b184b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -41,6 +41,7 @@ #include #include #include +#include static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -269,6 +270,10 @@ void invalidate_bdev(struct block_device *bdev) invalidate_bh_lrus(); lru_add_drain_all(); /* make sure all lru add caches are flushed */ invalidate_mapping_pages(mapping, 0, -1); + /* 99% of the time, we don't need to flush the cleancache on the bdev. + * But, for the strange corners, lets be cautious + */ + cleancache_flush_inode(mapping); } EXPORT_SYMBOL(invalidate_bdev); diff --git a/fs/mpage.c b/fs/mpage.c index 0afc809e46e0..fdfae9fa98cd 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -27,6 +27,7 @@ #include #include #include +#include /* * I/O completion handler for multipage BIOs. @@ -271,6 +272,12 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, SetPageMappedToDisk(page); } + if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) && + cleancache_get_page(page) == 0) { + SetPageUptodate(page); + goto confused; + } + /* * This page will go to BIO. Do we need to send this BIO off first? */ diff --git a/fs/super.c b/fs/super.c index 8a06881b1920..b383fa407740 100644 --- a/fs/super.c +++ b/fs/super.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "internal.h" @@ -112,6 +113,7 @@ static struct super_block *alloc_super(struct file_system_type *type) s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; + s->cleancache_poolid = -1; } out: return s; @@ -177,6 +179,7 @@ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { + cleancache_flush_fs(s); fs->kill_sb(s); /* * We need to call rcu_barrier so all the delayed rcu free diff --git a/mm/filemap.c b/mm/filemap.c index c641edf553a9..ec6fa2d7e200 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -34,6 +34,7 @@ #include /* for BUG_ON(!in_atomic()) only */ #include #include /* for page_is_file_cache() */ +#include #include "internal.h" /* @@ -118,6 +119,16 @@ void __delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; + /* + * if we're uptodate, flush out into the cleancache, otherwise + * invalidate any existing cleancache entries. We can't leave + * stale data around in the cleancache once our page is gone + */ + if (PageUptodate(page) && PageMappedToDisk(page)) + cleancache_put_page(page); + else + cleancache_flush_page(mapping, page); + radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; diff --git a/mm/truncate.c b/mm/truncate.c index a95667529135..3a29a6180212 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -19,6 +19,7 @@ #include #include /* grr. try_to_release_page, do_invalidatepage */ +#include #include "internal.h" @@ -51,6 +52,7 @@ void do_invalidatepage(struct page *page, unsigned long offset) static inline void truncate_partial_page(struct page *page, unsigned partial) { zero_user_segment(page, partial, PAGE_CACHE_SIZE); + cleancache_flush_page(page->mapping, page); if (page_has_private(page)) do_invalidatepage(page, partial); } @@ -214,6 +216,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pgoff_t next; int i; + cleancache_flush_inode(mapping); if (mapping->nrpages == 0) return; @@ -291,6 +294,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_release(&pvec); mem_cgroup_uncharge_end(); } + cleancache_flush_inode(mapping); } EXPORT_SYMBOL(truncate_inode_pages_range); @@ -440,6 +444,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, int did_range_unmap = 0; int wrapped = 0; + cleancache_flush_inode(mapping); pagevec_init(&pvec, 0); next = start; while (next <= end && !wrapped && @@ -498,6 +503,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, mem_cgroup_uncharge_end(); cond_resched(); } + cleancache_flush_inode(mapping); return ret; } EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); -- cgit v1.2.3 From 456f998ec817ebfa254464be4f089542fa390645 Mon Sep 17 00:00:00 2001 From: Ying Han Date: Thu, 26 May 2011 16:25:38 -0700 Subject: memcg: add the pagefault count into memcg stats Two new stats in per-memcg memory.stat which tracks the number of page faults and number of major page faults. "pgfault" "pgmajfault" They are different from "pgpgin"/"pgpgout" stat which count number of pages charged/discharged to the cgroup and have no meaning of reading/ writing page to disk. It is valuable to track the two stats for both measuring application's performance as well as the efficiency of the kernel page reclaim path. Counting pagefaults per process is useful, but we also need the aggregated value since processes are monitored and controlled in cgroup basis in memcg. Functional test: check the total number of pgfault/pgmajfault of all memcgs and compare with global vmstat value: $ cat /proc/vmstat | grep fault pgfault 1070751 pgmajfault 553 $ cat /dev/cgroup/memory.stat | grep fault pgfault 1071138 pgmajfault 553 total_pgfault 1071142 total_pgmajfault 553 $ cat /dev/cgroup/A/memory.stat | grep fault pgfault 199 pgmajfault 0 total_pgfault 199 total_pgmajfault 0 Performance test: run page fault test(pft) wit 16 thread on faulting in 15G anon pages in 16G container. There is no regression noticed on the "flt/cpu/s" Sample output from pft: TAG pft:anon-sys-default: Gb Thr CLine User System Wall flt/cpu/s fault/wsec 15 16 1 0.67s 233.41s 14.76s 16798.546 266356.260 +-------------------------------------------------------------------------+ N Min Max Median Avg Stddev x 10 16682.962 17344.027 16913.524 16928.812 166.5362 + 10 16695.568 16923.896 16820.604 16824.652 84.816568 No difference proven at 95.0% confidence [akpm@linux-foundation.org: fix build] [hughd@google.com: shmem fix] Signed-off-by: Ying Han Acked-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Reviewed-by: Minchan Kim Cc: Daisuke Nishimura Acked-by: Balbir Singh Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ncpfs/mmap.c | 2 ++ include/linux/memcontrol.h | 7 +++++++ mm/filemap.c | 1 + mm/memcontrol.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++ mm/memory.c | 2 ++ mm/shmem.c | 11 ++++++----- 6 files changed, 65 insertions(+), 5 deletions(-) (limited to 'mm/filemap.c') diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index a7c07b44b100..e5d71b27a5b0 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -92,6 +93,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area, * -- wli */ count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT); return VM_FAULT_MAJOR; } diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ac1e5d20916a..9724a38ee69d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -20,6 +20,8 @@ #ifndef _LINUX_MEMCONTROL_H #define _LINUX_MEMCONTROL_H #include +#include + struct mem_cgroup; struct page_cgroup; struct page; @@ -149,6 +151,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, unsigned long *total_scanned); u64 mem_cgroup_get_limit(struct mem_cgroup *mem); +void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); #ifdef CONFIG_TRANSPARENT_HUGEPAGE void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail); #endif @@ -357,6 +360,10 @@ static inline void mem_cgroup_split_huge_fixup(struct page *head, { } +static inline +void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) +{ +} #endif /* CONFIG_CGROUP_MEM_CONT */ #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) diff --git a/mm/filemap.c b/mm/filemap.c index 7455ccd8bda8..bcdc393b6580 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1661,6 +1661,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) /* No page in the page cache at all */ do_sync_mmap_readahead(vma, ra, file, offset); count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: page = find_get_page(mapping, offset); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4021fcd71b60..bd9052a5d3ad 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -94,6 +94,8 @@ enum mem_cgroup_events_index { MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ + MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ + MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ MEM_CGROUP_EVENTS_NSTATS, }; /* @@ -590,6 +592,16 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); } +void mem_cgroup_pgfault(struct mem_cgroup *mem, int val) +{ + this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); +} + +void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val) +{ + this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); +} + static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, enum mem_cgroup_events_index idx) { @@ -827,6 +839,33 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) return (mem == root_mem_cgroup); } +void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) +{ + struct mem_cgroup *mem; + + if (!mm) + return; + + rcu_read_lock(); + mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!mem)) + goto out; + + switch (idx) { + case PGMAJFAULT: + mem_cgroup_pgmajfault(mem, 1); + break; + case PGFAULT: + mem_cgroup_pgfault(mem, 1); + break; + default: + BUG(); + } +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(mem_cgroup_count_vm_event); + /* * Following LRU functions are allowed to be used without PCG_LOCK. * Operations are called by routine of global LRU independently from memcg. @@ -3958,6 +3997,8 @@ enum { MCS_PGPGIN, MCS_PGPGOUT, MCS_SWAP, + MCS_PGFAULT, + MCS_PGMAJFAULT, MCS_INACTIVE_ANON, MCS_ACTIVE_ANON, MCS_INACTIVE_FILE, @@ -3980,6 +4021,8 @@ struct { {"pgpgin", "total_pgpgin"}, {"pgpgout", "total_pgpgout"}, {"swap", "total_swap"}, + {"pgfault", "total_pgfault"}, + {"pgmajfault", "total_pgmajfault"}, {"inactive_anon", "total_inactive_anon"}, {"active_anon", "total_active_anon"}, {"inactive_file", "total_inactive_file"}, @@ -4008,6 +4051,10 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); s->stat[MCS_SWAP] += val * PAGE_SIZE; } + val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT); + s->stat[MCS_PGFAULT] += val; + val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT); + s->stat[MCS_PGMAJFAULT] += val; /* per zone stat */ val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); diff --git a/mm/memory.c b/mm/memory.c index fc24f7d788bd..6953d3926e01 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2874,6 +2874,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(mm, PGMAJFAULT); } else if (PageHWPoison(page)) { /* * hwpoisoned dirty swapcache pages are kept for killing @@ -3413,6 +3414,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, __set_current_state(TASK_RUNNING); count_vm_event(PGFAULT); + mem_cgroup_count_vm_event(mm, PGFAULT); /* do counter updates before entering really critical section. */ check_sync_rss_stat(current); diff --git a/mm/shmem.c b/mm/shmem.c index 69edb45a9f28..1acfb2687bfa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1305,12 +1305,10 @@ repeat: swappage = lookup_swap_cache(swap); if (!swappage) { shmem_swp_unmap(entry); + spin_unlock(&info->lock); /* here we actually do the io */ - if (type && !(*type & VM_FAULT_MAJOR)) { - __count_vm_event(PGMAJFAULT); + if (type) *type |= VM_FAULT_MAJOR; - } - spin_unlock(&info->lock); swappage = shmem_swapin(swap, gfp, info, idx); if (!swappage) { spin_lock(&info->lock); @@ -1549,7 +1547,10 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); - + if (ret & VM_FAULT_MAJOR) { + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + } return ret | VM_FAULT_LOCKED; } -- cgit v1.2.3 From 3d08bcc887a1c8d12be8d81f747ffa2e8a44b67b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 27 May 2011 12:23:34 -0700 Subject: mm: Wait for writeback when grabbing pages to begin a write When grabbing a page for a buffered IO write, the mm should wait for writeback on the page to complete so that the page does not become writable during the IO operation. This change is needed to provide page stability during writes for all filesystems. Signed-off-by: Darrick J. Wong Signed-off-by: Al Viro --- mm/filemap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index bcdc393b6580..dac95a24deac 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2327,7 +2327,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, repeat: page = find_lock_page(mapping, index); if (page) - return page; + goto found; page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); if (!page) @@ -2340,6 +2340,8 @@ repeat: goto repeat; return NULL; } +found: + wait_on_page_writeback(page); return page; } EXPORT_SYMBOL(grab_cache_page_write_begin); -- cgit v1.2.3 From 69b4573296469fd3f70cf7044693074980517067 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 28 May 2011 08:25:51 -0700 Subject: Cache xattr security drop check for write v2 Some recent benchmarking on btrfs showed that a major scaling bottleneck on large systems on btrfs is currently the xattr lookup on every write. Why xattr lookup on every write I hear you ask? write wants to drop suid and security related xattrs that could set o capabilities for executables. To do that it currently looks up security.capability on EVERY write (even for non executables) to decide whether to drop it or not. In btrfs this causes an additional tree walk, hitting some per file system locks and quite bad scalability. In a simple read workload on a 8S system I saw over 90% CPU time in spinlocks related to that. Chris Mason tells me this is also a problem in ext4, where it hits the global mbcache lock. This patch adds a simple per inode to avoid this problem. We only do the lookup once per file and then if there is no xattr cache the decision. All xattr changes clear the flag. I also used the same flag to avoid the suid check, although that one is pretty cheap. A file system can also set this flag when it creates the inode, if it has a cheap way to do so. This is done for some common file systems in followon patches. With this patch a major part of the lock contention disappears for btrfs. Some testing on smaller systems didn't show significant performance changes, but at least it helps the larger systems and is generally more efficient. v2: Rename is_sgid. add file system helper. Cc: chris.mason@oracle.com Cc: josef@redhat.com Cc: viro@zeniv.linux.org.uk Cc: agruen@linbit.com Cc: Serge E. Hallyn Signed-off-by: Andi Kleen Signed-off-by: Al Viro --- fs/attr.c | 7 +++++++ fs/xattr.c | 7 +++++-- include/linux/fs.h | 13 +++++++++++++ mm/filemap.c | 14 ++++++++++++-- 4 files changed, 37 insertions(+), 4 deletions(-) (limited to 'mm/filemap.c') diff --git a/fs/attr.c b/fs/attr.c index 91dbe2a107f2..caf2aa521e2b 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -175,6 +175,13 @@ int notify_change(struct dentry * dentry, struct iattr * attr) return -EPERM; } + if ((ia_valid & ATTR_MODE)) { + mode_t amode = attr->ia_mode; + /* Flag setting protected by i_mutex */ + if (is_sxid(amode)) + inode->i_flags &= ~S_NOSEC; + } + now = current_fs_time(inode->i_sb); attr->ia_ctime = now; diff --git a/fs/xattr.c b/fs/xattr.c index 4be2e7666d02..f060663ab70c 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -91,7 +91,11 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, { struct inode *inode = dentry->d_inode; int error = -EOPNOTSUPP; + int issec = !strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN); + if (issec) + inode->i_flags &= ~S_NOSEC; if (inode->i_op->setxattr) { error = inode->i_op->setxattr(dentry, name, value, size, flags); if (!error) { @@ -99,8 +103,7 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, security_inode_post_setxattr(dentry, name, value, size, flags); } - } else if (!strncmp(name, XATTR_SECURITY_PREFIX, - XATTR_SECURITY_PREFIX_LEN)) { + } else if (issec) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; error = security_inode_setsecurity(inode, suffix, value, size, flags); diff --git a/include/linux/fs.h b/include/linux/fs.h index 573028df050d..c55d6b7cd5d6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -237,6 +237,7 @@ struct inodes_stat_t { #define S_PRIVATE 512 /* Inode is fs-internal */ #define S_IMA 1024 /* Inode has an associated IMA struct */ #define S_AUTOMOUNT 2048 /* Automount/referral quasi-directory */ +#define S_NOSEC 4096 /* no suid or xattr security attributes */ /* * Note that nosuid etc flags are inode-specific: setting some file-system @@ -273,6 +274,7 @@ struct inodes_stat_t { #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE) #define IS_IMA(inode) ((inode)->i_flags & S_IMA) #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) +#define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) /* the read-only stuff doesn't really belong here, but any other place is probably as bad and I don't want to create yet another include file. */ @@ -2582,5 +2584,16 @@ int __init get_filesystem_list(char *buf); #define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \ (flag & __FMODE_NONOTIFY))) +static inline int is_sxid(mode_t mode) +{ + return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP)); +} + +static inline void inode_has_no_xattr(struct inode *inode) +{ + if (!is_sxid(inode->i_mode)) + inode->i_flags |= S_NOSEC; +} + #endif /* __KERNEL__ */ #endif /* _LINUX_FS_H */ diff --git a/mm/filemap.c b/mm/filemap.c index dac95a24deac..d7b10578a64b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1982,16 +1982,26 @@ static int __remove_suid(struct dentry *dentry, int kill) int file_remove_suid(struct file *file) { struct dentry *dentry = file->f_path.dentry; - int killsuid = should_remove_suid(dentry); - int killpriv = security_inode_need_killpriv(dentry); + struct inode *inode = dentry->d_inode; + int killsuid; + int killpriv; int error = 0; + /* Fast path for nothing security related */ + if (IS_NOSEC(inode)) + return 0; + + killsuid = should_remove_suid(dentry); + killpriv = security_inode_need_killpriv(dentry); + if (killpriv < 0) return killpriv; if (killpriv) error = security_inode_killpriv(dentry); if (!error && killsuid) error = __remove_suid(dentry, killsuid); + if (!error) + inode->i_flags |= S_NOSEC; return error; } -- cgit v1.2.3 From 9e1f1de02c2275d7172e18dc4e7c2065777611bf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 3 Jun 2011 18:24:58 -0400 Subject: more conservative S_NOSEC handling Caching "we have already removed suid/caps" was overenthusiastic as merged. On network filesystems we might have had suid/caps set on another client, silently picked by this client on revalidate, all of that *without* clearing the S_NOSEC flag. AFAICS, the only reasonably sane way to deal with that is * new superblock flag; unless set, S_NOSEC is not going to be set. * local block filesystems set it in their ->mount() (more accurately, mount_bdev() does, so does btrfs ->mount(), users of mount_bdev() other than local block ones clear it) * if any network filesystem (or a cluster one) wants to use S_NOSEC, it'll need to set MS_NOSEC in sb->s_flags *AND* take care to clear S_NOSEC when inode attribute changes are picked from other clients. It's not an earth-shattering hole (anybody that can set suid on another client will almost certainly be able to write to the file before doing that anyway), but it's a bug that needs fixing. Signed-off-by: Al Viro --- fs/btrfs/super.c | 2 +- fs/fuse/inode.c | 2 ++ fs/ocfs2/super.c | 2 +- fs/super.c | 2 +- include/linux/fs.h | 3 ++- mm/filemap.c | 2 +- 6 files changed, 8 insertions(+), 5 deletions(-) (limited to 'mm/filemap.c') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9b2e7e5bc3ef..d158b672a2d2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -819,7 +819,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } else { char b[BDEVNAME_SIZE]; - s->s_flags = flags; + s->s_flags = flags | MS_NOSEC; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index cc6ec4b2f0ff..38f84cd48b67 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -921,6 +921,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_flags & MS_MANDLOCK) goto err; + sb->s_flags &= ~MS_NOSEC; + if (!parse_fuse_opt((char *) data, &d, is_bdev)) goto err; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index cdbaf5e97308..56f61027236b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1072,7 +1072,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = OCFS2_SUPER_MAGIC; - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) | ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, diff --git a/fs/super.c b/fs/super.c index c75593953c52..ab3d672db0de 100644 --- a/fs/super.c +++ b/fs/super.c @@ -822,7 +822,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, } else { char b[BDEVNAME_SIZE]; - s->s_flags = flags; + s->s_flags = flags | MS_NOSEC; s->s_mode = mode; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); sb_set_blocksize(s, block_size(bdev)); diff --git a/include/linux/fs.h b/include/linux/fs.h index c55d6b7cd5d6..646a1836152a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -208,6 +208,7 @@ struct inodes_stat_t { #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ #define MS_I_VERSION (1<<23) /* Update inode I_version field */ #define MS_STRICTATIME (1<<24) /* Always perform atime updates */ +#define MS_NOSEC (1<<28) #define MS_BORN (1<<29) #define MS_ACTIVE (1<<30) #define MS_NOUSER (1<<31) @@ -2591,7 +2592,7 @@ static inline int is_sxid(mode_t mode) static inline void inode_has_no_xattr(struct inode *inode) { - if (!is_sxid(inode->i_mode)) + if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & MS_NOSEC)) inode->i_flags |= S_NOSEC; } diff --git a/mm/filemap.c b/mm/filemap.c index d7b10578a64b..a8251a8d3457 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2000,7 +2000,7 @@ int file_remove_suid(struct file *file) error = security_inode_killpriv(dentry); if (!error && killsuid) error = __remove_suid(dentry, killsuid); - if (!error) + if (!error && (inode->i_sb->s_flags & MS_NOSEC)) inode->i_flags |= S_NOSEC; return error; -- cgit v1.2.3