summaryrefslogtreecommitdiff
path: root/mm/filemap.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/filemap.c')
-rw-r--r--mm/filemap.c430
1 files changed, 139 insertions, 291 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 21781f1fe52b..a82fbe4c9e8e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/memcontrol.h>
#include <linux/cleancache.h>
+#include <linux/rmap.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
@@ -76,7 +77,7 @@
* ->mmap_sem
* ->lock_page (access_process_vm)
*
- * ->i_mutex (generic_file_buffered_write)
+ * ->i_mutex (generic_perform_write)
* ->mmap_sem (fault_in_pages_readable->do_page_fault)
*
* bdi->wb.list_lock
@@ -562,7 +563,7 @@ static int __add_to_page_cache_locked(struct page *page,
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
- error = mem_cgroup_cache_charge(page, current->mm,
+ error = mem_cgroup_charge_file(page, current->mm,
gfp_mask & GFP_RECLAIM_MASK);
if (error)
return error;
@@ -1427,7 +1428,8 @@ static void shrink_readahead_size_eio(struct file *filp,
* do_generic_file_read - generic file read routine
* @filp: the file to read
* @ppos: current file position
- * @desc: read_descriptor
+ * @iter: data destination
+ * @written: already copied
*
* This is a generic file read routine, and uses the
* mapping->a_ops->readpage() function for the actual low-level stuff.
@@ -1435,8 +1437,8 @@ static void shrink_readahead_size_eio(struct file *filp,
* This is really ugly. But the goto's actually try to clarify some
* of the logic when it comes to error handling etc.
*/
-static void do_generic_file_read(struct file *filp, loff_t *ppos,
- read_descriptor_t *desc)
+static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
+ struct iov_iter *iter, ssize_t written)
{
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
@@ -1446,12 +1448,12 @@ static void do_generic_file_read(struct file *filp, loff_t *ppos,
pgoff_t prev_index;
unsigned long offset; /* offset into pagecache page */
unsigned int prev_offset;
- int error;
+ int error = 0;
index = *ppos >> PAGE_CACHE_SHIFT;
prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
- last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+ last_index = (*ppos + iter->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;
for (;;) {
@@ -1486,7 +1488,7 @@ find_page:
if (!page->mapping)
goto page_not_up_to_date_locked;
if (!mapping->a_ops->is_partially_uptodate(page,
- desc, offset))
+ offset, iter->count))
goto page_not_up_to_date_locked;
unlock_page(page);
}
@@ -1536,24 +1538,23 @@ page_ok:
/*
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
- *
- * The file_read_actor routine returns how many bytes were
- * actually used..
- * NOTE! This may not be the same as how much of a user buffer
- * we filled up (we may be padding etc), so we can only update
- * "pos" here (the actor routine has to update the user buffer
- * pointers and the remaining count).
*/
- ret = file_read_actor(desc, page, offset, nr);
+
+ ret = copy_page_to_iter(page, offset, nr, iter);
offset += ret;
index += offset >> PAGE_CACHE_SHIFT;
offset &= ~PAGE_CACHE_MASK;
prev_offset = offset;
page_cache_release(page);
- if (ret == nr && desc->count)
- continue;
- goto out;
+ written += ret;
+ if (!iov_iter_count(iter))
+ goto out;
+ if (ret < nr) {
+ error = -EFAULT;
+ goto out;
+ }
+ continue;
page_not_up_to_date:
/* Get exclusive access to the page ... */
@@ -1588,6 +1589,7 @@ readpage:
if (unlikely(error)) {
if (error == AOP_TRUNCATED_PAGE) {
page_cache_release(page);
+ error = 0;
goto find_page;
}
goto readpage_error;
@@ -1618,7 +1620,6 @@ readpage:
readpage_error:
/* UHHUH! A synchronous read error occurred. Report it */
- desc->error = error;
page_cache_release(page);
goto out;
@@ -1629,16 +1630,17 @@ no_cached_page:
*/
page = page_cache_alloc_cold(mapping);
if (!page) {
- desc->error = -ENOMEM;
+ error = -ENOMEM;
goto out;
}
error = add_to_page_cache_lru(page, mapping,
index, GFP_KERNEL);
if (error) {
page_cache_release(page);
- if (error == -EEXIST)
+ if (error == -EEXIST) {
+ error = 0;
goto find_page;
- desc->error = error;
+ }
goto out;
}
goto readpage;
@@ -1651,44 +1653,7 @@ out:
*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
file_accessed(filp);
-}
-
-int file_read_actor(read_descriptor_t *desc, struct page *page,
- unsigned long offset, unsigned long size)
-{
- char *kaddr;
- unsigned long left, count = desc->count;
-
- if (size > count)
- size = count;
-
- /*
- * Faults on the destination of a read are common, so do it before
- * taking the kmap.
- */
- if (!fault_in_pages_writeable(desc->arg.buf, size)) {
- kaddr = kmap_atomic(page);
- left = __copy_to_user_inatomic(desc->arg.buf,
- kaddr + offset, size);
- kunmap_atomic(kaddr);
- if (left == 0)
- goto success;
- }
-
- /* Do it the slow way */
- kaddr = kmap(page);
- left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
- kunmap(page);
-
- if (left) {
- size -= left;
- desc->error = -EFAULT;
- }
-success:
- desc->count = count - size;
- desc->written += size;
- desc->arg.buf += size;
- return size;
+ return written ? written : error;
}
/*
@@ -1746,14 +1711,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
{
struct file *filp = iocb->ki_filp;
ssize_t retval;
- unsigned long seg = 0;
size_t count;
loff_t *ppos = &iocb->ki_pos;
+ struct iov_iter i;
count = 0;
retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
if (retval)
return retval;
+ iov_iter_init(&i, iov, nr_segs, count, 0);
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (filp->f_flags & O_DIRECT) {
@@ -1775,6 +1741,11 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
if (retval > 0) {
*ppos = pos + retval;
count -= retval;
+ /*
+ * If we did a short DIO read we need to skip the
+ * section of the iov that we've already read data into.
+ */
+ iov_iter_advance(&i, retval);
}
/*
@@ -1791,39 +1762,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
}
}
- count = retval;
- for (seg = 0; seg < nr_segs; seg++) {
- read_descriptor_t desc;
- loff_t offset = 0;
-
- /*
- * If we did a short DIO read we need to skip the section of the
- * iov that we've already read data into.
- */
- if (count) {
- if (count > iov[seg].iov_len) {
- count -= iov[seg].iov_len;
- continue;
- }
- offset = count;
- count = 0;
- }
-
- desc.written = 0;
- desc.arg.buf = iov[seg].iov_base + offset;
- desc.count = iov[seg].iov_len - offset;
- if (desc.count == 0)
- continue;
- desc.error = 0;
- do_generic_file_read(filp, ppos, &desc);
- retval += desc.written;
- if (desc.error) {
- retval = retval ?: desc.error;
- break;
- }
- if (desc.count > 0)
- break;
- }
+ retval = do_generic_file_read(filp, ppos, &i, retval);
out:
return retval;
}
@@ -1952,11 +1891,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct inode *inode = mapping->host;
pgoff_t offset = vmf->pgoff;
struct page *page;
- pgoff_t size;
+ loff_t size;
int ret = 0;
- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (offset >= size)
+ size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
+ if (offset >= size >> PAGE_CACHE_SHIFT)
return VM_FAULT_SIGBUS;
/*
@@ -2005,8 +1944,8 @@ retry_find:
* Found the page and have a reference on it.
* We must recheck i_size under page lock.
*/
- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (unlikely(offset >= size)) {
+ size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
+ if (unlikely(offset >= size >> PAGE_CACHE_SHIFT)) {
unlock_page(page);
page_cache_release(page);
return VM_FAULT_SIGBUS;
@@ -2064,6 +2003,78 @@ page_not_uptodate:
}
EXPORT_SYMBOL(filemap_fault);
+void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+ struct file *file = vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ loff_t size;
+ struct page *page;
+ unsigned long address = (unsigned long) vmf->virtual_address;
+ unsigned long addr;
+ pte_t *pte;
+
+ rcu_read_lock();
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) {
+ if (iter.index > vmf->max_pgoff)
+ break;
+repeat:
+ page = radix_tree_deref_slot(slot);
+ if (unlikely(!page))
+ goto next;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page))
+ break;
+ else
+ goto next;
+ }
+
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *slot)) {
+ page_cache_release(page);
+ goto repeat;
+ }
+
+ if (!PageUptodate(page) ||
+ PageReadahead(page) ||
+ PageHWPoison(page))
+ goto skip;
+ if (!trylock_page(page))
+ goto skip;
+
+ if (page->mapping != mapping || !PageUptodate(page))
+ goto unlock;
+
+ size = round_up(i_size_read(mapping->host), PAGE_CACHE_SIZE);
+ if (page->index >= size >> PAGE_CACHE_SHIFT)
+ goto unlock;
+
+ pte = vmf->pte + page->index - vmf->pgoff;
+ if (!pte_none(*pte))
+ goto unlock;
+
+ if (file->f_ra.mmap_miss > 0)
+ file->f_ra.mmap_miss--;
+ addr = address + (page->index - vmf->pgoff) * PAGE_SIZE;
+ do_set_pte(vma, addr, page, pte, false, false);
+ unlock_page(page);
+ goto next;
+unlock:
+ unlock_page(page);
+skip:
+ page_cache_release(page);
+next:
+ if (iter.index == vmf->max_pgoff)
+ break;
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(filemap_map_pages);
+
int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct page *page = vmf->page;
@@ -2093,6 +2104,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite);
const struct vm_operations_struct generic_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = filemap_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
@@ -2261,150 +2273,6 @@ struct page *read_cache_page_gfp(struct address_space *mapping,
}
EXPORT_SYMBOL(read_cache_page_gfp);
-static size_t __iovec_copy_from_user_inatomic(char *vaddr,
- const struct iovec *iov, size_t base, size_t bytes)
-{
- size_t copied = 0, left = 0;
-
- while (bytes) {
- char __user *buf = iov->iov_base + base;
- int copy = min(bytes, iov->iov_len - base);
-
- base = 0;
- left = __copy_from_user_inatomic(vaddr, buf, copy);
- copied += copy;
- bytes -= copy;
- vaddr += copy;
- iov++;
-
- if (unlikely(left))
- break;
- }
- return copied - left;
-}
-
-/*
- * Copy as much as we can into the page and return the number of bytes which
- * were successfully copied. If a fault is encountered then return the number of
- * bytes which were copied.
- */
-size_t iov_iter_copy_from_user_atomic(struct page *page,
- struct iov_iter *i, unsigned long offset, size_t bytes)
-{
- char *kaddr;
- size_t copied;
-
- BUG_ON(!in_atomic());
- kaddr = kmap_atomic(page);
- if (likely(i->nr_segs == 1)) {
- int left;
- char __user *buf = i->iov->iov_base + i->iov_offset;
- left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
- copied = bytes - left;
- } else {
- copied = __iovec_copy_from_user_inatomic(kaddr + offset,
- i->iov, i->iov_offset, bytes);
- }
- kunmap_atomic(kaddr);
-
- return copied;
-}
-EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
-
-/*
- * This has the same sideeffects and return value as
- * iov_iter_copy_from_user_atomic().
- * The difference is that it attempts to resolve faults.
- * Page must not be locked.
- */
-size_t iov_iter_copy_from_user(struct page *page,
- struct iov_iter *i, unsigned long offset, size_t bytes)
-{
- char *kaddr;
- size_t copied;
-
- kaddr = kmap(page);
- if (likely(i->nr_segs == 1)) {
- int left;
- char __user *buf = i->iov->iov_base + i->iov_offset;
- left = __copy_from_user(kaddr + offset, buf, bytes);
- copied = bytes - left;
- } else {
- copied = __iovec_copy_from_user_inatomic(kaddr + offset,
- i->iov, i->iov_offset, bytes);
- }
- kunmap(page);
- return copied;
-}
-EXPORT_SYMBOL(iov_iter_copy_from_user);
-
-void iov_iter_advance(struct iov_iter *i, size_t bytes)
-{
- BUG_ON(i->count < bytes);
-
- if (likely(i->nr_segs == 1)) {
- i->iov_offset += bytes;
- i->count -= bytes;
- } else {
- const struct iovec *iov = i->iov;
- size_t base = i->iov_offset;
- unsigned long nr_segs = i->nr_segs;
-
- /*
- * The !iov->iov_len check ensures we skip over unlikely
- * zero-length segments (without overruning the iovec).
- */
- while (bytes || unlikely(i->count && !iov->iov_len)) {
- int copy;
-
- copy = min(bytes, iov->iov_len - base);
- BUG_ON(!i->count || i->count < copy);
- i->count -= copy;
- bytes -= copy;
- base += copy;
- if (iov->iov_len == base) {
- iov++;
- nr_segs--;
- base = 0;
- }
- }
- i->iov = iov;
- i->iov_offset = base;
- i->nr_segs = nr_segs;
- }
-}
-EXPORT_SYMBOL(iov_iter_advance);
-
-/*
- * Fault in the first iovec of the given iov_iter, to a maximum length
- * of bytes. Returns 0 on success, or non-zero if the memory could not be
- * accessed (ie. because it is an invalid address).
- *
- * writev-intensive code may want this to prefault several iovecs -- that
- * would be possible (callers must not rely on the fact that _only_ the
- * first iovec will be faulted with the current implementation).
- */
-int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
-{
- char __user *buf = i->iov->iov_base + i->iov_offset;
- bytes = min(bytes, i->iov->iov_len - i->iov_offset);
- return fault_in_pages_readable(buf, bytes);
-}
-EXPORT_SYMBOL(iov_iter_fault_in_readable);
-
-/*
- * Return the count of just the current iov_iter segment.
- */
-size_t iov_iter_single_seg_count(const struct iov_iter *i)
-{
- const struct iovec *iov = i->iov;
- if (i->nr_segs == 1)
- return i->count;
- else
- return min(i->count, iov->iov_len - i->iov_offset);
-}
-EXPORT_SYMBOL(iov_iter_single_seg_count);
-
/*
* Performs necessary checks before doing a write
*
@@ -2511,7 +2379,7 @@ EXPORT_SYMBOL(pagecache_write_end);
ssize_t
generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long *nr_segs, loff_t pos, loff_t *ppos,
+ unsigned long *nr_segs, loff_t pos,
size_t count, size_t ocount)
{
struct file *file = iocb->ki_filp;
@@ -2572,7 +2440,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
i_size_write(inode, pos);
mark_inode_dirty(inode);
}
- *ppos = pos;
+ iocb->ki_pos = pos;
}
out:
return written;
@@ -2618,7 +2486,7 @@ found:
}
EXPORT_SYMBOL(grab_cache_page_write_begin);
-static ssize_t generic_perform_write(struct file *file,
+ssize_t generic_perform_write(struct file *file,
struct iov_iter *i, loff_t pos)
{
struct address_space *mapping = file->f_mapping;
@@ -2668,9 +2536,7 @@ again:
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
- pagefault_disable();
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
- pagefault_enable();
flush_dcache_page(page);
mark_page_accessed(page);
@@ -2708,27 +2574,7 @@ again:
return written ? written : status;
}
-
-ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos, loff_t *ppos,
- size_t count, ssize_t written)
-{
- struct file *file = iocb->ki_filp;
- ssize_t status;
- struct iov_iter i;
-
- iov_iter_init(&i, iov, nr_segs, count, written);
- status = generic_perform_write(file, &i, pos);
-
- if (likely(status >= 0)) {
- written += status;
- *ppos = pos + status;
- }
-
- return written ? written : status;
-}
-EXPORT_SYMBOL(generic_file_buffered_write);
+EXPORT_SYMBOL(generic_perform_write);
/**
* __generic_file_aio_write - write data to a file
@@ -2750,16 +2596,18 @@ EXPORT_SYMBOL(generic_file_buffered_write);
* avoid syncing under i_mutex.
*/
ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos)
+ unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
struct address_space * mapping = file->f_mapping;
size_t ocount; /* original count */
size_t count; /* after file limit checks */
struct inode *inode = mapping->host;
- loff_t pos;
- ssize_t written;
+ loff_t pos = iocb->ki_pos;
+ ssize_t written = 0;
ssize_t err;
+ ssize_t status;
+ struct iov_iter from;
ocount = 0;
err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
@@ -2767,12 +2615,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
return err;
count = ocount;
- pos = *ppos;
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
- written = 0;
-
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
goto out;
@@ -2788,45 +2633,47 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (err)
goto out;
+ iov_iter_init(&from, iov, nr_segs, count, 0);
+
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (unlikely(file->f_flags & O_DIRECT)) {
loff_t endbyte;
- ssize_t written_buffered;
- written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
- ppos, count, ocount);
+ written = generic_file_direct_write(iocb, iov, &from.nr_segs, pos,
+ count, ocount);
if (written < 0 || written == count)
goto out;
+ iov_iter_advance(&from, written);
+
/*
* direct-io write to a hole: fall through to buffered I/O
* for completing the rest of the request.
*/
pos += written;
count -= written;
- written_buffered = generic_file_buffered_write(iocb, iov,
- nr_segs, pos, ppos, count,
- written);
+
+ status = generic_perform_write(file, &from, pos);
/*
- * If generic_file_buffered_write() retuned a synchronous error
+ * If generic_perform_write() returned a synchronous error
* then we want to return the number of bytes which were
* direct-written, or the error code if that was zero. Note
* that this differs from normal direct-io semantics, which
* will return -EFOO even if some bytes were written.
*/
- if (written_buffered < 0) {
- err = written_buffered;
+ if (unlikely(status < 0) && !written) {
+ err = status;
goto out;
}
-
+ iocb->ki_pos = pos + status;
/*
* We need to ensure that the page cache pages are written to
* disk and invalidated to preserve the expected O_DIRECT
* semantics.
*/
- endbyte = pos + written_buffered - written - 1;
+ endbyte = pos + status - 1;
err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
if (err == 0) {
- written = written_buffered;
+ written += status;
invalidate_mapping_pages(mapping,
pos >> PAGE_CACHE_SHIFT,
endbyte >> PAGE_CACHE_SHIFT);
@@ -2837,8 +2684,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
*/
}
} else {
- written = generic_file_buffered_write(iocb, iov, nr_segs,
- pos, ppos, count, written);
+ written = generic_perform_write(file, &from, pos);
+ if (likely(written >= 0))
+ iocb->ki_pos = pos + written;
}
out:
current->backing_dev_info = NULL;
@@ -2867,7 +2715,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
BUG_ON(iocb->ki_pos != pos);
mutex_lock(&inode->i_mutex);
- ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+ ret = __generic_file_aio_write(iocb, iov, nr_segs);
mutex_unlock(&inode->i_mutex);
if (ret > 0) {