diff options
Diffstat (limited to 'mm/filemap.c')
-rw-r--r-- | mm/filemap.c | 244 |
1 files changed, 173 insertions, 71 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 1aaea26556cc..38546dca58fe 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -414,7 +414,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, .range_end = end, }; - if (!mapping_cap_writeback_dirty(mapping) || + if (!mapping_can_writeback(mapping) || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; @@ -988,9 +988,43 @@ void __init pagecache_init(void) page_writeback_init(); } +/* + * The page wait code treats the "wait->flags" somewhat unusually, because + * we have multiple different kinds of waits, not just the usual "exclusive" + * one. + * + * We have: + * + * (a) no special bits set: + * + * We're just waiting for the bit to be released, and when a waker + * calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up, + * and remove it from the wait queue. + * + * Simple and straightforward. + * + * (b) WQ_FLAG_EXCLUSIVE: + * + * The waiter is waiting to get the lock, and only one waiter should + * be woken up to avoid any thundering herd behavior. We'll set the + * WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue. + * + * This is the traditional exclusive wait. + * + * (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM: + * + * The waiter is waiting to get the bit, and additionally wants the + * lock to be transferred to it for fair lock behavior. If the lock + * cannot be taken, we stop walking the wait queue without waking + * the waiter. + * + * This is the "fair lock handoff" case, and in addition to setting + * WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see + * that it now has the lock. + */ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { - int ret; + unsigned int flags; struct wait_page_key *key = arg; struct wait_page_queue *wait_page = container_of(wait, struct wait_page_queue, wait); @@ -999,35 +1033,44 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, return 0; /* - * If it's an exclusive wait, we get the bit for it, and - * stop walking if we can't. - * - * If it's a non-exclusive wait, then the fact that this - * wake function was called means that the bit already - * was cleared, and we don't care if somebody then - * re-took it. + * If it's a lock handoff wait, we get the bit for it, and + * stop walking (and do not wake it up) if we can't. */ - ret = 0; - if (wait->flags & WQ_FLAG_EXCLUSIVE) { - if (test_and_set_bit(key->bit_nr, &key->page->flags)) + flags = wait->flags; + if (flags & WQ_FLAG_EXCLUSIVE) { + if (test_bit(key->bit_nr, &key->page->flags)) return -1; - ret = 1; + if (flags & WQ_FLAG_CUSTOM) { + if (test_and_set_bit(key->bit_nr, &key->page->flags)) + return -1; + flags |= WQ_FLAG_DONE; + } } - wait->flags |= WQ_FLAG_WOKEN; + /* + * We are holding the wait-queue lock, but the waiter that + * is waiting for this will be checking the flags without + * any locking. + * + * So update the flags atomically, and wake up the waiter + * afterwards to avoid any races. This store-release pairs + * with the load-acquire in wait_on_page_bit_common(). + */ + smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN); wake_up_state(wait->private, mode); /* * Ok, we have successfully done what we're waiting for, * and we can unconditionally remove the wait entry. * - * Note that this has to be the absolute last thing we do, - * since after list_del_init(&wait->entry) the wait entry + * Note that this pairs with the "finish_wait()" in the + * waiter, and has to be the absolute last thing we do. + * After this list_del_init(&wait->entry) the wait entry * might be de-allocated and the process might even have * exited. */ list_del_init_careful(&wait->entry); - return ret; + return (flags & WQ_FLAG_EXCLUSIVE) != 0; } static void wake_up_page_bit(struct page *page, int bit_nr) @@ -1107,8 +1150,8 @@ enum behavior { }; /* - * Attempt to check (or get) the page bit, and mark the - * waiter woken if successful. + * Attempt to check (or get) the page bit, and mark us done + * if successful. */ static inline bool trylock_page_bit_common(struct page *page, int bit_nr, struct wait_queue_entry *wait) @@ -1119,13 +1162,17 @@ static inline bool trylock_page_bit_common(struct page *page, int bit_nr, } else if (test_bit(bit_nr, &page->flags)) return false; - wait->flags |= WQ_FLAG_WOKEN; + wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE; return true; } +/* How many times do we accept lock stealing from under a waiter? */ +int sysctl_page_lock_unfairness = 5; + static inline int wait_on_page_bit_common(wait_queue_head_t *q, struct page *page, int bit_nr, int state, enum behavior behavior) { + int unfairness = sysctl_page_lock_unfairness; struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; @@ -1143,11 +1190,18 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, } init_wait(wait); - wait->flags = behavior == EXCLUSIVE ? WQ_FLAG_EXCLUSIVE : 0; wait->func = wake_page_function; wait_page.page = page; wait_page.bit_nr = bit_nr; +repeat: + wait->flags = 0; + if (behavior == EXCLUSIVE) { + wait->flags = WQ_FLAG_EXCLUSIVE; + if (--unfairness < 0) + wait->flags |= WQ_FLAG_CUSTOM; + } + /* * Do one last check whether we can get the * page bit synchronously. @@ -1170,27 +1224,63 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, /* * From now on, all the logic will be based on - * the WQ_FLAG_WOKEN flag, and the and the page - * bit testing (and setting) will be - or has - * already been - done by the wake function. + * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to + * see whether the page bit testing has already + * been done by the wake function. * * We can drop our reference to the page. */ if (behavior == DROP) put_page(page); + /* + * Note that until the "finish_wait()", or until + * we see the WQ_FLAG_WOKEN flag, we need to + * be very careful with the 'wait->flags', because + * we may race with a waker that sets them. + */ for (;;) { + unsigned int flags; + set_current_state(state); - if (signal_pending_state(state, current)) + /* Loop until we've been woken or interrupted */ + flags = smp_load_acquire(&wait->flags); + if (!(flags & WQ_FLAG_WOKEN)) { + if (signal_pending_state(state, current)) + break; + + io_schedule(); + continue; + } + + /* If we were non-exclusive, we're done */ + if (behavior != EXCLUSIVE) break; - if (wait->flags & WQ_FLAG_WOKEN) + /* If the waker got the lock for us, we're done */ + if (flags & WQ_FLAG_DONE) break; - io_schedule(); + /* + * Otherwise, if we're getting the lock, we need to + * try to get it ourselves. + * + * And if that fails, we'll have to retry this all. + */ + if (unlikely(test_and_set_bit(bit_nr, &page->flags))) + goto repeat; + + wait->flags |= WQ_FLAG_DONE; + break; } + /* + * If a signal happened, this 'finish_wait()' may remove the last + * waiter from the wait-queues, but the PageWaiters bit will remain + * set. That's ok. The next wakeup will take care of it, and trying + * to do it here would be difficult and prone to races. + */ finish_wait(q, wait); if (thrashing) { @@ -1200,12 +1290,20 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, } /* - * A signal could leave PageWaiters set. Clearing it here if - * !waitqueue_active would be possible (by open-coding finish_wait), - * but still fail to catch it in the case of wait hash collision. We - * already can fail to clear wait hash collision cases, so don't - * bother with signals either. + * NOTE! The wait->flags weren't stable until we've done the + * 'finish_wait()', and we could have exited the loop above due + * to a signal, and had a wakeup event happen after the signal + * test but before the 'finish_wait()'. + * + * So only after the finish_wait() can we reliably determine + * if we got woken up or not, so we can now figure out the final + * return value based on that state without races. + * + * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive + * waiter, but an exclusive one requires WQ_FLAG_DONE. */ + if (behavior == EXCLUSIVE) + return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR; return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR; } @@ -1547,19 +1645,19 @@ EXPORT_SYMBOL(page_cache_prev_miss); /** * find_get_entry - find and get a page cache entry * @mapping: the address_space to search - * @offset: the page cache index + * @index: The page cache index. * * Looks up the page cache slot at @mapping & @offset. If there is a - * page cache page, it is returned with an increased refcount. + * page cache page, the head page is returned with an increased refcount. * * If the slot holds a shadow entry of a previously evicted page, or a * swap entry from shmem/tmpfs, it is returned. * - * Return: the found page or shadow entry, %NULL if nothing is found. + * Return: The head page or shadow entry, %NULL if nothing is found. */ -struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) +struct page *find_get_entry(struct address_space *mapping, pgoff_t index) { - XA_STATE(xas, &mapping->i_pages, offset); + XA_STATE(xas, &mapping->i_pages, index); struct page *page; rcu_read_lock(); @@ -1587,7 +1685,6 @@ repeat: put_page(page); goto repeat; } - page = find_subpage(page, offset); out: rcu_read_unlock(); @@ -1595,40 +1692,37 @@ out: } /** - * find_lock_entry - locate, pin and lock a page cache entry - * @mapping: the address_space to search - * @offset: the page cache index + * find_lock_entry - Locate and lock a page cache entry. + * @mapping: The address_space to search. + * @index: The page cache index. * - * Looks up the page cache slot at @mapping & @offset. If there is a - * page cache page, it is returned locked and with an increased - * refcount. + * Looks up the page at @mapping & @index. If there is a page in the + * cache, the head page is returned locked and with an increased refcount. * * If the slot holds a shadow entry of a previously evicted page, or a * swap entry from shmem/tmpfs, it is returned. * - * find_lock_entry() may sleep. - * - * Return: the found page or shadow entry, %NULL if nothing is found. + * Context: May sleep. + * Return: The head page or shadow entry, %NULL if nothing is found. */ -struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) +struct page *find_lock_entry(struct address_space *mapping, pgoff_t index) { struct page *page; repeat: - page = find_get_entry(mapping, offset); + page = find_get_entry(mapping, index); if (page && !xa_is_value(page)) { lock_page(page); /* Has the page been truncated? */ - if (unlikely(page_mapping(page) != mapping)) { + if (unlikely(page->mapping != mapping)) { unlock_page(page); put_page(page); goto repeat; } - VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); + VM_BUG_ON_PAGE(!thp_contains(page, index), page); } return page; } -EXPORT_SYMBOL(find_lock_entry); /** * pagecache_get_page - Find and get a reference to a page. @@ -1643,6 +1737,8 @@ EXPORT_SYMBOL(find_lock_entry); * * * %FGP_ACCESSED - The page will be marked accessed. * * %FGP_LOCK - The page is returned locked. + * * %FGP_HEAD - If the page is present and a THP, return the head page + * rather than the exact page specified by the index. * * %FGP_CREAT - If no page is present then a new page is allocated using * @gfp_mask and added to the page cache and the VM's LRU list. * The page is returned locked and with an increased refcount. @@ -1683,12 +1779,12 @@ repeat: } /* Has the page been truncated? */ - if (unlikely(compound_head(page)->mapping != mapping)) { + if (unlikely(page->mapping != mapping)) { unlock_page(page); put_page(page); goto repeat; } - VM_BUG_ON_PAGE(page->index != index, page); + VM_BUG_ON_PAGE(!thp_contains(page, index), page); } if (fgp_flags & FGP_ACCESSED) @@ -1698,11 +1794,13 @@ repeat: if (page_is_idle(page)) clear_page_idle(page); } + if (!(fgp_flags & FGP_HEAD)) + page = find_subpage(page, index); no_page: if (!page && (fgp_flags & FGP_CREAT)) { int err; - if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) + if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping)) gfp_mask |= __GFP_WRITE; if (fgp_flags & FGP_NOFS) gfp_mask &= ~__GFP_FS; @@ -2267,7 +2365,11 @@ readpage: } if (!PageUptodate(page)) { - error = lock_page_killable(page); + if (iocb->ki_flags & IOCB_WAITQ) + error = lock_page_async(page, iocb->ki_waitq); + else + error = lock_page_killable(page); + if (unlikely(error)) goto readpage_error; if (!PageUptodate(page)) { @@ -2691,42 +2793,42 @@ void filemap_map_pages(struct vm_fault *vmf, pgoff_t last_pgoff = start_pgoff; unsigned long max_idx; XA_STATE(xas, &mapping->i_pages, start_pgoff); - struct page *page; + struct page *head, *page; unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); rcu_read_lock(); - xas_for_each(&xas, page, end_pgoff) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, head, end_pgoff) { + if (xas_retry(&xas, head)) continue; - if (xa_is_value(page)) + if (xa_is_value(head)) goto next; /* * Check for a locked page first, as a speculative * reference may adversely influence page migration. */ - if (PageLocked(page)) + if (PageLocked(head)) goto next; - if (!page_cache_get_speculative(page)) + if (!page_cache_get_speculative(head)) goto next; /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) + if (unlikely(head != xas_reload(&xas))) goto skip; - page = find_subpage(page, xas.xa_index); + page = find_subpage(head, xas.xa_index); - if (!PageUptodate(page) || + if (!PageUptodate(head) || PageReadahead(page) || PageHWPoison(page)) goto skip; - if (!trylock_page(page)) + if (!trylock_page(head)) goto skip; - if (page->mapping != mapping || !PageUptodate(page)) + if (head->mapping != mapping || !PageUptodate(head)) goto unlock; max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); - if (page->index >= max_idx) + if (xas.xa_index >= max_idx) goto unlock; if (mmap_miss > 0) @@ -2738,12 +2840,12 @@ void filemap_map_pages(struct vm_fault *vmf, last_pgoff = xas.xa_index; if (alloc_set_pte(vmf, page)) goto unlock; - unlock_page(page); + unlock_page(head); goto next; unlock: - unlock_page(page); + unlock_page(head); skip: - put_page(page); + put_page(head); next: /* Huge page is mapped? No need to proceed. */ if (pmd_trans_huge(*vmf->pmd)) |