summaryrefslogtreecommitdiff
path: root/mm/filemap.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/filemap.c')
-rw-r--r--mm/filemap.c206
1 files changed, 114 insertions, 92 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index bc943867d68c..7c00f105845e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -101,7 +101,7 @@
* ->tree_lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
* ->inode->i_lock (page_remove_rmap->set_page_dirty)
- * ->memcg->move_lock (page_remove_rmap->mem_cgroup_begin_page_stat)
+ * ->memcg->move_lock (page_remove_rmap->lock_page_memcg)
* bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
@@ -176,11 +176,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
/*
* Delete a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
- * is safe. The caller must hold the mapping's tree_lock and
- * mem_cgroup_begin_page_stat().
+ * is safe. The caller must hold the mapping's tree_lock.
*/
-void __delete_from_page_cache(struct page *page, void *shadow,
- struct mem_cgroup *memcg)
+void __delete_from_page_cache(struct page *page, void *shadow)
{
struct address_space *mapping = page->mapping;
@@ -195,6 +193,30 @@ void __delete_from_page_cache(struct page *page, void *shadow,
else
cleancache_invalidate_page(mapping, page);
+ VM_BUG_ON_PAGE(page_mapped(page), page);
+ if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
+ int mapcount;
+
+ pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n",
+ current->comm, page_to_pfn(page));
+ dump_page(page, "still mapped when deleted");
+ dump_stack();
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+ mapcount = page_mapcount(page);
+ if (mapping_exiting(mapping) &&
+ page_count(page) >= mapcount + 2) {
+ /*
+ * All vmas have already been torn down, so it's
+ * a good bet that actually the page is unmapped,
+ * and we'd prefer not to leak it: if we're wrong,
+ * some other bad page check should catch it later.
+ */
+ page_mapcount_reset(page);
+ atomic_sub(mapcount, &page->_count);
+ }
+ }
+
page_cache_tree_delete(mapping, page, shadow);
page->mapping = NULL;
@@ -205,7 +227,6 @@ void __delete_from_page_cache(struct page *page, void *shadow,
__dec_zone_page_state(page, NR_FILE_PAGES);
if (PageSwapBacked(page))
__dec_zone_page_state(page, NR_SHMEM);
- VM_BUG_ON_PAGE(page_mapped(page), page);
/*
* At this point page must be either written or cleaned by truncate.
@@ -216,8 +237,7 @@ void __delete_from_page_cache(struct page *page, void *shadow,
* anyway will be cleared before returning page into buddy allocator.
*/
if (WARN_ON_ONCE(PageDirty(page)))
- account_page_cleaned(page, mapping, memcg,
- inode_to_wb(mapping->host));
+ account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
}
/**
@@ -231,7 +251,6 @@ void __delete_from_page_cache(struct page *page, void *shadow,
void delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
- struct mem_cgroup *memcg;
unsigned long flags;
void (*freepage)(struct page *);
@@ -240,11 +259,9 @@ void delete_from_page_cache(struct page *page)
freepage = mapping->a_ops->freepage;
- memcg = mem_cgroup_begin_page_stat(page);
spin_lock_irqsave(&mapping->tree_lock, flags);
- __delete_from_page_cache(page, NULL, memcg);
+ __delete_from_page_cache(page, NULL);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
if (freepage)
freepage(page);
@@ -446,7 +463,8 @@ int filemap_write_and_wait(struct address_space *mapping)
{
int err = 0;
- if (mapping->nrpages) {
+ if ((!dax_mapping(mapping) && mapping->nrpages) ||
+ (dax_mapping(mapping) && mapping->nrexceptional)) {
err = filemap_fdatawrite(mapping);
/*
* Even if the above returned error, the pages may be
@@ -482,13 +500,8 @@ int filemap_write_and_wait_range(struct address_space *mapping,
{
int err = 0;
- if (dax_mapping(mapping) && mapping->nrexceptional) {
- err = dax_writeback_mapping_range(mapping, lstart, lend);
- if (err)
- return err;
- }
-
- if (mapping->nrpages) {
+ if ((!dax_mapping(mapping) && mapping->nrpages) ||
+ (dax_mapping(mapping) && mapping->nrexceptional)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
/* See comment of filemap_write_and_wait() */
@@ -532,7 +545,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
if (!error) {
struct address_space *mapping = old->mapping;
void (*freepage)(struct page *);
- struct mem_cgroup *memcg;
unsigned long flags;
pgoff_t offset = old->index;
@@ -542,9 +554,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
new->mapping = mapping;
new->index = offset;
- memcg = mem_cgroup_begin_page_stat(old);
spin_lock_irqsave(&mapping->tree_lock, flags);
- __delete_from_page_cache(old, NULL, memcg);
+ __delete_from_page_cache(old, NULL);
error = radix_tree_insert(&mapping->page_tree, offset, new);
BUG_ON(error);
mapping->nrpages++;
@@ -557,8 +568,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
if (PageSwapBacked(new))
__inc_zone_page_state(new, NR_SHMEM);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
- mem_cgroup_replace_page(old, new);
+ mem_cgroup_migrate(old, new);
radix_tree_preload_end();
if (freepage)
freepage(old);
@@ -576,7 +586,7 @@ static int page_cache_tree_insert(struct address_space *mapping,
void **slot;
int error;
- error = __radix_tree_create(&mapping->page_tree, page->index,
+ error = __radix_tree_create(&mapping->page_tree, page->index, 0,
&node, &slot);
if (error)
return error;
@@ -1245,7 +1255,6 @@ unsigned find_get_entries(struct address_space *mapping,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
struct page *page;
repeat:
@@ -1253,8 +1262,10 @@ repeat:
if (unlikely(!page))
continue;
if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
/*
* A shadow entry of a recently evicted page, a swap
* entry from shmem/tmpfs or a DAX entry. Return it
@@ -1307,7 +1318,6 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
struct page *page;
repeat:
@@ -1317,13 +1327,8 @@ repeat:
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- WARN_ON(iter.index);
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
* A shadow entry of a recently evicted page,
@@ -1374,7 +1379,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
struct page *page;
repeat:
@@ -1385,12 +1389,8 @@ repeat:
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
* A shadow entry of a recently evicted page,
@@ -1450,7 +1450,6 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_tagged(slot, &mapping->page_tree,
&iter, *index, tag) {
struct page *page;
@@ -1461,12 +1460,8 @@ repeat:
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
* A shadow entry of a recently evicted page.
@@ -1529,7 +1524,6 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_tagged(slot, &mapping->page_tree,
&iter, start, tag) {
struct page *page;
@@ -1539,12 +1533,8 @@ repeat:
continue;
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
@@ -1649,6 +1639,15 @@ find_page:
index, last_index - index);
}
if (!PageUptodate(page)) {
+ /*
+ * See comment in do_read_cache_page on why
+ * wait_on_page_locked is used to avoid unnecessarily
+ * serialisations and why it's safe.
+ */
+ wait_on_page_locked_killable(page);
+ if (PageUptodate(page))
+ goto page_ok;
+
if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
!mapping->a_ops->is_partially_uptodate)
goto page_not_up_to_date;
@@ -1890,6 +1889,7 @@ EXPORT_SYMBOL(generic_file_read_iter);
* page_cache_read - adds requested page to the page cache if not already there
* @file: file to read
* @offset: page index
+ * @gfp_mask: memory allocation flags
*
* This adds the requested page to the page cache if it isn't already there,
* and schedules an I/O to read in its contents from disk.
@@ -2151,10 +2151,11 @@ repeat:
if (unlikely(!page))
goto next;
if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- break;
- else
- goto next;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+ goto next;
}
if (!page_cache_get_speculative(page))
@@ -2283,7 +2284,7 @@ static struct page *wait_on_page_read(struct page *page)
return page;
}
-static struct page *__read_cache_page(struct address_space *mapping,
+static struct page *do_read_cache_page(struct address_space *mapping,
pgoff_t index,
int (*filler)(void *, struct page *),
void *data,
@@ -2305,53 +2306,74 @@ repeat:
/* Presumably ENOMEM for radix tree node */
return ERR_PTR(err);
}
+
+filler:
err = filler(data, page);
if (err < 0) {
page_cache_release(page);
- page = ERR_PTR(err);
- } else {
- page = wait_on_page_read(page);
+ return ERR_PTR(err);
}
- }
- return page;
-}
-
-static struct page *do_read_cache_page(struct address_space *mapping,
- pgoff_t index,
- int (*filler)(void *, struct page *),
- void *data,
- gfp_t gfp)
-{
- struct page *page;
- int err;
+ page = wait_on_page_read(page);
+ if (IS_ERR(page))
+ return page;
+ goto out;
+ }
+ if (PageUptodate(page))
+ goto out;
-retry:
- page = __read_cache_page(mapping, index, filler, data, gfp);
- if (IS_ERR(page))
- return page;
+ /*
+ * Page is not up to date and may be locked due one of the following
+ * case a: Page is being filled and the page lock is held
+ * case b: Read/write error clearing the page uptodate status
+ * case c: Truncation in progress (page locked)
+ * case d: Reclaim in progress
+ *
+ * Case a, the page will be up to date when the page is unlocked.
+ * There is no need to serialise on the page lock here as the page
+ * is pinned so the lock gives no additional protection. Even if the
+ * the page is truncated, the data is still valid if PageUptodate as
+ * it's a race vs truncate race.
+ * Case b, the page will not be up to date
+ * Case c, the page may be truncated but in itself, the data may still
+ * be valid after IO completes as it's a read vs truncate race. The
+ * operation must restart if the page is not uptodate on unlock but
+ * otherwise serialising on page lock to stabilise the mapping gives
+ * no additional guarantees to the caller as the page lock is
+ * released before return.
+ * Case d, similar to truncation. If reclaim holds the page lock, it
+ * will be a race with remove_mapping that determines if the mapping
+ * is valid on unlock but otherwise the data is valid and there is
+ * no need to serialise with page lock.
+ *
+ * As the page lock gives no additional guarantee, we optimistically
+ * wait on the page to be unlocked and check if it's up to date and
+ * use the page if it is. Otherwise, the page lock is required to
+ * distinguish between the different cases. The motivation is that we
+ * avoid spurious serialisations and wakeups when multiple processes
+ * wait on the same page for IO to complete.
+ */
+ wait_on_page_locked(page);
if (PageUptodate(page))
goto out;
+ /* Distinguish between all the cases under the safety of the lock */
lock_page(page);
+
+ /* Case c or d, restart the operation */
if (!page->mapping) {
unlock_page(page);
page_cache_release(page);
- goto retry;
+ goto repeat;
}
+
+ /* Someone else locked and filled the page in a very small window */
if (PageUptodate(page)) {
unlock_page(page);
goto out;
}
- err = filler(data, page);
- if (err < 0) {
- page_cache_release(page);
- return ERR_PTR(err);
- } else {
- page = wait_on_page_read(page);
- if (IS_ERR(page))
- return page;
- }
+ goto filler;
+
out:
mark_page_accessed(page);
return page;