summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-11-02 22:25:03 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2021-11-02 22:25:03 +0300
commitc03098d4b9ad76bca2966a8769dcfe59f7f85103 (patch)
treee7e2a6a0a84ad29baa14c018e3d4dcb12bd08fd6 /lib
parentab2e7f4b46bf8fccf088ec496b3bb26b43e91340 (diff)
parentb01b2d72da25c000aeb124bc78daf3fb998be2b6 (diff)
downloadlinux-c03098d4b9ad76bca2966a8769dcfe59f7f85103.tar.xz
Merge tag 'gfs2-v5.15-rc5-mmap-fault' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2
Pull gfs2 mmap + page fault deadlocks fixes from Andreas Gruenbacher: "Functions gfs2_file_read_iter and gfs2_file_write_iter are both accessing the user buffer to write to or read from while holding the inode glock. In the most basic deadlock scenario, that buffer will not be resident and it will be mapped to the same file. Accessing the buffer will trigger a page fault, and gfs2 will deadlock trying to take the same inode glock again while trying to handle that fault. Fix that and similar, more complex scenarios by disabling page faults while accessing user buffers. To make this work, introduce a small amount of new infrastructure and fix some bugs that didn't trigger so far, with page faults enabled" * tag 'gfs2-v5.15-rc5-mmap-fault' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2: gfs2: Fix mmap + page fault deadlocks for direct I/O iov_iter: Introduce nofault flag to disable page faults gup: Introduce FOLL_NOFAULT flag to disable page faults iomap: Add done_before argument to iomap_dio_rw iomap: Support partial direct I/O on user copy failures iomap: Fix iomap_dio_rw return value for user copies gfs2: Fix mmap + page fault deadlocks for buffered I/O gfs2: Eliminate ip->i_gh gfs2: Move the inode glock locking to gfs2_file_buffered_write gfs2: Introduce flag for glock holder auto-demotion gfs2: Clean up function may_grant gfs2: Add wrapper for iomap_file_buffered_write iov_iter: Introduce fault_in_iov_iter_writeable iov_iter: Turn iov_iter_fault_in_readable into fault_in_iov_iter_readable gup: Turn fault_in_pages_{readable,writeable} into fault_in_{readable,writeable} powerpc/kvm: Fix kvm_use_magic_page iov_iter: Fix iov_iter_get_pages{,_alloc} page fault return value
Diffstat (limited to 'lib')
-rw-r--r--lib/iov_iter.c103
1 files changed, 80 insertions, 23 deletions
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 755c10c5138c..66a740e6e153 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -191,7 +191,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b
buf = iov->iov_base + skip;
copy = min(bytes, iov->iov_len - skip);
- if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
+ if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_writeable(buf, copy)) {
kaddr = kmap_atomic(page);
from = kaddr + offset;
@@ -275,7 +275,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t
buf = iov->iov_base + skip;
copy = min(bytes, iov->iov_len - skip);
- if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
+ if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_readable(buf, copy)) {
kaddr = kmap_atomic(page);
to = kaddr + offset;
@@ -430,35 +430,81 @@ out:
}
/*
+ * fault_in_iov_iter_readable - fault in iov iterator for reading
+ * @i: iterator
+ * @size: maximum length
+ *
* Fault in one or more iovecs of the given iov_iter, to a maximum length of
- * bytes. For each iovec, fault in each page that constitutes the iovec.
+ * @size. For each iovec, fault in each page that constitutes the iovec.
+ *
+ * Returns the number of bytes not faulted in (like copy_to_user() and
+ * copy_from_user()).
*
- * Return 0 on success, or non-zero if the memory could not be accessed (i.e.
- * because it is an invalid address).
+ * Always returns 0 for non-userspace iterators.
*/
-int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
+size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
{
if (iter_is_iovec(i)) {
+ size_t count = min(size, iov_iter_count(i));
const struct iovec *p;
size_t skip;
- if (bytes > i->count)
- bytes = i->count;
- for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
- size_t len = min(bytes, p->iov_len - skip);
- int err;
+ size -= count;
+ for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
+ size_t len = min(count, p->iov_len - skip);
+ size_t ret;
if (unlikely(!len))
continue;
- err = fault_in_pages_readable(p->iov_base + skip, len);
- if (unlikely(err))
- return err;
- bytes -= len;
+ ret = fault_in_readable(p->iov_base + skip, len);
+ count -= len - ret;
+ if (ret)
+ break;
}
+ return count + size;
}
return 0;
}
-EXPORT_SYMBOL(iov_iter_fault_in_readable);
+EXPORT_SYMBOL(fault_in_iov_iter_readable);
+
+/*
+ * fault_in_iov_iter_writeable - fault in iov iterator for writing
+ * @i: iterator
+ * @size: maximum length
+ *
+ * Faults in the iterator using get_user_pages(), i.e., without triggering
+ * hardware page faults. This is primarily useful when we already know that
+ * some or all of the pages in @i aren't in memory.
+ *
+ * Returns the number of bytes not faulted in, like copy_to_user() and
+ * copy_from_user().
+ *
+ * Always returns 0 for non-user-space iterators.
+ */
+size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
+{
+ if (iter_is_iovec(i)) {
+ size_t count = min(size, iov_iter_count(i));
+ const struct iovec *p;
+ size_t skip;
+
+ size -= count;
+ for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
+ size_t len = min(count, p->iov_len - skip);
+ size_t ret;
+
+ if (unlikely(!len))
+ continue;
+ ret = fault_in_safe_writeable(p->iov_base + skip, len);
+ count -= len - ret;
+ if (ret)
+ break;
+ }
+ return count + size;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(fault_in_iov_iter_writeable);
void iov_iter_init(struct iov_iter *i, unsigned int direction,
const struct iovec *iov, unsigned long nr_segs,
@@ -467,6 +513,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
WARN_ON(direction & ~(READ | WRITE));
*i = (struct iov_iter) {
.iter_type = ITER_IOVEC,
+ .nofault = false,
.data_source = direction,
.iov = iov,
.nr_segs = nr_segs,
@@ -1481,14 +1528,18 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
return 0;
if (likely(iter_is_iovec(i))) {
+ unsigned int gup_flags = 0;
unsigned long addr;
+ if (iov_iter_rw(i) != WRITE)
+ gup_flags |= FOLL_WRITE;
+ if (i->nofault)
+ gup_flags |= FOLL_NOFAULT;
+
addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
n = DIV_ROUND_UP(len, PAGE_SIZE);
- res = get_user_pages_fast(addr, n,
- iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0,
- pages);
- if (unlikely(res < 0))
+ res = get_user_pages_fast(addr, n, gup_flags, pages);
+ if (unlikely(res <= 0))
return res;
return (res == n ? len : res * PAGE_SIZE) - *start;
}
@@ -1603,17 +1654,23 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
return 0;
if (likely(iter_is_iovec(i))) {
+ unsigned int gup_flags = 0;
unsigned long addr;
+ if (iov_iter_rw(i) != WRITE)
+ gup_flags |= FOLL_WRITE;
+ if (i->nofault)
+ gup_flags |= FOLL_NOFAULT;
+
addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
n = DIV_ROUND_UP(len, PAGE_SIZE);
p = get_pages_array(n);
if (!p)
return -ENOMEM;
- res = get_user_pages_fast(addr, n,
- iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p);
- if (unlikely(res < 0)) {
+ res = get_user_pages_fast(addr, n, gup_flags, p);
+ if (unlikely(res <= 0)) {
kvfree(p);
+ *pages = NULL;
return res;
}
*pages = p;