summaryrefslogtreecommitdiff
path: root/fs/read_write.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/read_write.c')
-rw-r--r--fs/read_write.c615
1 files changed, 152 insertions, 463 deletions
diff --git a/fs/read_write.c b/fs/read_write.c
index 19f5c4bf75aa..75f764b43418 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -419,27 +419,42 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo
return ret;
}
+static int warn_unsupported(struct file *file, const char *op)
+{
+ pr_warn_ratelimited(
+ "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
+ op, file, current->pid, current->comm);
+ return -EINVAL;
+}
+
ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
- mm_segment_t old_fs = get_fs();
+ struct kvec iov = {
+ .iov_base = buf,
+ .iov_len = min_t(size_t, count, MAX_RW_COUNT),
+ };
+ struct kiocb kiocb;
+ struct iov_iter iter;
ssize_t ret;
if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
return -EINVAL;
if (!(file->f_mode & FMODE_CAN_READ))
return -EINVAL;
+ /*
+ * Also fail if ->read_iter and ->read are both wired up as that
+ * implies very convoluted semantics.
+ */
+ if (unlikely(!file->f_op->read_iter || file->f_op->read))
+ return warn_unsupported(file, "read");
- if (count > MAX_RW_COUNT)
- count = MAX_RW_COUNT;
- set_fs(KERNEL_DS);
- if (file->f_op->read)
- ret = file->f_op->read(file, (void __user *)buf, count, pos);
- else if (file->f_op->read_iter)
- ret = new_sync_read(file, (void __user *)buf, count, pos);
- else
- ret = -EINVAL;
- set_fs(old_fs);
+ init_sync_kiocb(&kiocb, file);
+ kiocb.ki_pos = pos ? *pos : 0;
+ iov_iter_kvec(&iter, READ, &iov, 1, iov.iov_len);
+ ret = file->f_op->read_iter(&kiocb, &iter);
if (ret > 0) {
+ if (pos)
+ *pos = kiocb.ki_pos;
fsnotify_access(file);
add_rchar(current, ret);
}
@@ -510,28 +525,32 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
/* caller is responsible for file_start_write/file_end_write */
ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
{
- mm_segment_t old_fs;
- const char __user *p;
+ struct kvec iov = {
+ .iov_base = (void *)buf,
+ .iov_len = min_t(size_t, count, MAX_RW_COUNT),
+ };
+ struct kiocb kiocb;
+ struct iov_iter iter;
ssize_t ret;
if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
return -EBADF;
if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
+ /*
+ * Also fail if ->write_iter and ->write are both wired up as that
+ * implies very convoluted semantics.
+ */
+ if (unlikely(!file->f_op->write_iter || file->f_op->write))
+ return warn_unsupported(file, "write");
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- p = (__force const char __user *)buf;
- if (count > MAX_RW_COUNT)
- count = MAX_RW_COUNT;
- if (file->f_op->write)
- ret = file->f_op->write(file, p, count, pos);
- else if (file->f_op->write_iter)
- ret = new_sync_write(file, p, count, pos);
- else
- ret = -EINVAL;
- set_fs(old_fs);
+ init_sync_kiocb(&kiocb, file);
+ kiocb.ki_pos = pos ? *pos : 0;
+ iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
+ ret = file->f_op->write_iter(&kiocb, &iter);
if (ret > 0) {
+ if (pos)
+ *pos = kiocb.ki_pos;
fsnotify_modify(file);
add_wchar(current, ret);
}
@@ -889,7 +908,7 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
}
EXPORT_SYMBOL(vfs_iter_write);
-ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
+static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
unsigned long vlen, loff_t *pos, rwf_t flags)
{
struct iovec iovstack[UIO_FASTIOV];
@@ -1392,6 +1411,59 @@ static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
}
/*
+ * Performs necessary checks before doing a file copy
+ *
+ * Can adjust amount of bytes to copy via @req_count argument.
+ * Returns appropriate error code that caller should return or
+ * zero in case the copy should be allowed.
+ */
+static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t *req_count, unsigned int flags)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ uint64_t count = *req_count;
+ loff_t size_in;
+ int ret;
+
+ ret = generic_file_rw_checks(file_in, file_out);
+ if (ret)
+ return ret;
+
+ /* Don't touch certain kinds of inodes */
+ if (IS_IMMUTABLE(inode_out))
+ return -EPERM;
+
+ if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+ return -ETXTBSY;
+
+ /* Ensure offsets don't wrap. */
+ if (pos_in + count < pos_in || pos_out + count < pos_out)
+ return -EOVERFLOW;
+
+ /* Shorten the copy to EOF */
+ size_in = i_size_read(inode_in);
+ if (pos_in >= size_in)
+ count = 0;
+ else
+ count = min(count, size_in - (uint64_t)pos_in);
+
+ ret = generic_write_check_limits(file_out, pos_out, &count);
+ if (ret)
+ return ret;
+
+ /* Don't allow overlapped copying within the same file. */
+ if (inode_in == inode_out &&
+ pos_out + count > pos_in &&
+ pos_out < pos_in + count)
+ return -EINVAL;
+
+ *req_count = count;
+ return 0;
+}
+
+/*
* copy_file_range() differs from regular file read and write in that it
* specifically allows return partial success. When it does so is up to
* the copy_file_range method.
@@ -1523,475 +1595,92 @@ out2:
return ret;
}
-static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
- bool write)
-{
- struct inode *inode = file_inode(file);
-
- if (unlikely(pos < 0 || len < 0))
- return -EINVAL;
-
- if (unlikely((loff_t) (pos + len) < 0))
- return -EINVAL;
-
- if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
- loff_t end = len ? pos + len - 1 : OFFSET_MAX;
- int retval;
-
- retval = locks_mandatory_area(inode, file, pos, end,
- write ? F_WRLCK : F_RDLCK);
- if (retval < 0)
- return retval;
- }
-
- return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
-}
-/*
- * Ensure that we don't remap a partial EOF block in the middle of something
- * else. Assume that the offsets have already been checked for block
- * alignment.
- *
- * For clone we only link a partial EOF block above or at the destination file's
- * EOF. For deduplication we accept a partial EOF block only if it ends at the
- * destination file's EOF (can not link it into the middle of a file).
- *
- * Shorten the request if possible.
- */
-static int generic_remap_check_len(struct inode *inode_in,
- struct inode *inode_out,
- loff_t pos_out,
- loff_t *len,
- unsigned int remap_flags)
-{
- u64 blkmask = i_blocksize(inode_in) - 1;
- loff_t new_len = *len;
-
- if ((*len & blkmask) == 0)
- return 0;
-
- if (pos_out + *len < i_size_read(inode_out))
- new_len &= ~blkmask;
-
- if (new_len == *len)
- return 0;
-
- if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
- *len = new_len;
- return 0;
- }
-
- return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
-}
-
-/* Read a page's worth of file data into the page cache. */
-static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
-{
- struct page *page;
-
- page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
- if (IS_ERR(page))
- return page;
- if (!PageUptodate(page)) {
- put_page(page);
- return ERR_PTR(-EIO);
- }
- return page;
-}
-
/*
- * Lock two pages, ensuring that we lock in offset order if the pages are from
- * the same file.
+ * Don't operate on ranges the page cache doesn't support, and don't exceed the
+ * LFS limits. If pos is under the limit it becomes a short access. If it
+ * exceeds the limit we return -EFBIG.
*/
-static void vfs_lock_two_pages(struct page *page1, struct page *page2)
+int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
{
- /* Always lock in order of increasing index. */
- if (page1->index > page2->index)
- swap(page1, page2);
-
- lock_page(page1);
- if (page1 != page2)
- lock_page(page2);
-}
-
-/* Unlock two pages, being careful not to unlock the same page twice. */
-static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
-{
- unlock_page(page1);
- if (page1 != page2)
- unlock_page(page2);
-}
-
-/*
- * Compare extents of two files to see if they are the same.
- * Caller must have locked both inodes to prevent write races.
- */
-static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
- struct inode *dest, loff_t destoff,
- loff_t len, bool *is_same)
-{
- loff_t src_poff;
- loff_t dest_poff;
- void *src_addr;
- void *dest_addr;
- struct page *src_page;
- struct page *dest_page;
- loff_t cmp_len;
- bool same;
- int error;
-
- error = -EINVAL;
- same = true;
- while (len) {
- src_poff = srcoff & (PAGE_SIZE - 1);
- dest_poff = destoff & (PAGE_SIZE - 1);
- cmp_len = min(PAGE_SIZE - src_poff,
- PAGE_SIZE - dest_poff);
- cmp_len = min(cmp_len, len);
- if (cmp_len <= 0)
- goto out_error;
-
- src_page = vfs_dedupe_get_page(src, srcoff);
- if (IS_ERR(src_page)) {
- error = PTR_ERR(src_page);
- goto out_error;
- }
- dest_page = vfs_dedupe_get_page(dest, destoff);
- if (IS_ERR(dest_page)) {
- error = PTR_ERR(dest_page);
- put_page(src_page);
- goto out_error;
- }
-
- vfs_lock_two_pages(src_page, dest_page);
+ struct inode *inode = file->f_mapping->host;
+ loff_t max_size = inode->i_sb->s_maxbytes;
+ loff_t limit = rlimit(RLIMIT_FSIZE);
- /*
- * Now that we've locked both pages, make sure they're still
- * mapped to the file data we're interested in. If not,
- * someone is invalidating pages on us and we lose.
- */
- if (!PageUptodate(src_page) || !PageUptodate(dest_page) ||
- src_page->mapping != src->i_mapping ||
- dest_page->mapping != dest->i_mapping) {
- same = false;
- goto unlock;
+ if (limit != RLIM_INFINITY) {
+ if (pos >= limit) {
+ send_sig(SIGXFSZ, current, 0);
+ return -EFBIG;
}
+ *count = min(*count, limit - pos);
+ }
- src_addr = kmap_atomic(src_page);
- dest_addr = kmap_atomic(dest_page);
-
- flush_dcache_page(src_page);
- flush_dcache_page(dest_page);
-
- if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
- same = false;
+ if (!(file->f_flags & O_LARGEFILE))
+ max_size = MAX_NON_LFS;
- kunmap_atomic(dest_addr);
- kunmap_atomic(src_addr);
-unlock:
- vfs_unlock_two_pages(src_page, dest_page);
- put_page(dest_page);
- put_page(src_page);
+ if (unlikely(pos >= max_size))
+ return -EFBIG;
- if (!same)
- break;
+ *count = min(*count, max_size - pos);
- srcoff += cmp_len;
- destoff += cmp_len;
- len -= cmp_len;
- }
-
- *is_same = same;
return 0;
-
-out_error:
- return error;
}
/*
- * Check that the two inodes are eligible for cloning, the ranges make
- * sense, and then flush all dirty data. Caller must ensure that the
- * inodes have been locked against any other modifications.
+ * Performs necessary checks before doing a write
*
- * If there's an error, then the usual negative error code is returned.
- * Otherwise returns 0 with *len set to the request length.
+ * Can adjust writing position or amount of bytes to write.
+ * Returns appropriate error code that caller should return or
+ * zero in case that write should be allowed.
*/
-int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *len, unsigned int remap_flags)
+ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- bool same_inode = (inode_in == inode_out);
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ loff_t count;
int ret;
- /* Don't touch certain kinds of inodes */
- if (IS_IMMUTABLE(inode_out))
- return -EPERM;
-
- if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+ if (IS_SWAPFILE(inode))
return -ETXTBSY;
- /* Don't reflink dirs, pipes, sockets... */
- if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
- return -EISDIR;
- if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
- return -EINVAL;
-
- /* Zero length dedupe exits immediately; reflink goes to EOF. */
- if (*len == 0) {
- loff_t isize = i_size_read(inode_in);
-
- if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
- return 0;
- if (pos_in > isize)
- return -EINVAL;
- *len = isize - pos_in;
- if (*len == 0)
- return 0;
- }
-
- /* Check that we don't violate system file offset limits. */
- ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* Wait for the completion of any pending IOs on both files */
- inode_dio_wait(inode_in);
- if (!same_inode)
- inode_dio_wait(inode_out);
-
- ret = filemap_write_and_wait_range(inode_in->i_mapping,
- pos_in, pos_in + *len - 1);
- if (ret)
- return ret;
-
- ret = filemap_write_and_wait_range(inode_out->i_mapping,
- pos_out, pos_out + *len - 1);
- if (ret)
- return ret;
-
- /*
- * Check that the extents are the same.
- */
- if (remap_flags & REMAP_FILE_DEDUP) {
- bool is_same = false;
-
- ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
- inode_out, pos_out, *len, &is_same);
- if (ret)
- return ret;
- if (!is_same)
- return -EBADE;
- }
-
- ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* If can't alter the file contents, we're done. */
- if (!(remap_flags & REMAP_FILE_DEDUP))
- ret = file_modified(file_out);
-
- return ret;
-}
-EXPORT_SYMBOL(generic_remap_file_range_prep);
-
-loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t len, unsigned int remap_flags)
-{
- loff_t ret;
-
- WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
-
- /*
- * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
- * the same mount. Practically, they only need to be on the same file
- * system.
- */
- if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
- return -EXDEV;
-
- ret = generic_file_rw_checks(file_in, file_out);
- if (ret < 0)
- return ret;
-
- if (!file_in->f_op->remap_file_range)
- return -EOPNOTSUPP;
-
- ret = remap_verify_area(file_in, pos_in, len, false);
- if (ret)
- return ret;
-
- ret = remap_verify_area(file_out, pos_out, len, true);
- if (ret)
- return ret;
-
- ret = file_in->f_op->remap_file_range(file_in, pos_in,
- file_out, pos_out, len, remap_flags);
- if (ret < 0)
- return ret;
-
- fsnotify_access(file_in);
- fsnotify_modify(file_out);
- return ret;
-}
-EXPORT_SYMBOL(do_clone_file_range);
-
-loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t len, unsigned int remap_flags)
-{
- loff_t ret;
-
- file_start_write(file_out);
- ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
- remap_flags);
- file_end_write(file_out);
-
- return ret;
-}
-EXPORT_SYMBOL(vfs_clone_file_range);
-
-/* Check whether we are allowed to dedupe the destination file */
-static bool allow_file_dedupe(struct file *file)
-{
- if (capable(CAP_SYS_ADMIN))
- return true;
- if (file->f_mode & FMODE_WRITE)
- return true;
- if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
- return true;
- if (!inode_permission(file_inode(file), MAY_WRITE))
- return true;
- return false;
-}
+ if (!iov_iter_count(from))
+ return 0;
-loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
- struct file *dst_file, loff_t dst_pos,
- loff_t len, unsigned int remap_flags)
-{
- loff_t ret;
+ /* FIXME: this is for backwards compatibility with 2.4 */
+ if (iocb->ki_flags & IOCB_APPEND)
+ iocb->ki_pos = i_size_read(inode);
- WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
- REMAP_FILE_CAN_SHORTEN));
+ if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+ return -EINVAL;
- ret = mnt_want_write_file(dst_file);
+ count = iov_iter_count(from);
+ ret = generic_write_check_limits(file, iocb->ki_pos, &count);
if (ret)
return ret;
- ret = remap_verify_area(dst_file, dst_pos, len, true);
- if (ret < 0)
- goto out_drop_write;
-
- ret = -EPERM;
- if (!allow_file_dedupe(dst_file))
- goto out_drop_write;
-
- ret = -EXDEV;
- if (src_file->f_path.mnt != dst_file->f_path.mnt)
- goto out_drop_write;
-
- ret = -EISDIR;
- if (S_ISDIR(file_inode(dst_file)->i_mode))
- goto out_drop_write;
-
- ret = -EINVAL;
- if (!dst_file->f_op->remap_file_range)
- goto out_drop_write;
-
- if (len == 0) {
- ret = 0;
- goto out_drop_write;
- }
-
- ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
- dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
-out_drop_write:
- mnt_drop_write_file(dst_file);
-
- return ret;
+ iov_iter_truncate(from, count);
+ return iov_iter_count(from);
}
-EXPORT_SYMBOL(vfs_dedupe_file_range_one);
+EXPORT_SYMBOL(generic_write_checks);
-int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
+/*
+ * Performs common checks before doing a file copy/clone
+ * from @file_in to @file_out.
+ */
+int generic_file_rw_checks(struct file *file_in, struct file *file_out)
{
- struct file_dedupe_range_info *info;
- struct inode *src = file_inode(file);
- u64 off;
- u64 len;
- int i;
- int ret;
- u16 count = same->dest_count;
- loff_t deduped;
-
- if (!(file->f_mode & FMODE_READ))
- return -EINVAL;
-
- if (same->reserved1 || same->reserved2)
- return -EINVAL;
-
- off = same->src_offset;
- len = same->src_length;
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
- if (S_ISDIR(src->i_mode))
+ /* Don't copy dirs, pipes, sockets... */
+ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
return -EISDIR;
-
- if (!S_ISREG(src->i_mode))
- return -EINVAL;
-
- if (!file->f_op->remap_file_range)
- return -EOPNOTSUPP;
-
- ret = remap_verify_area(file, off, len, false);
- if (ret < 0)
- return ret;
- ret = 0;
-
- if (off + len > i_size_read(src))
+ if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
return -EINVAL;
- /* Arbitrary 1G limit on a single dedupe request, can be raised. */
- len = min_t(u64, len, 1 << 30);
-
- /* pre-format output fields to sane values */
- for (i = 0; i < count; i++) {
- same->info[i].bytes_deduped = 0ULL;
- same->info[i].status = FILE_DEDUPE_RANGE_SAME;
- }
-
- for (i = 0, info = same->info; i < count; i++, info++) {
- struct fd dst_fd = fdget(info->dest_fd);
- struct file *dst_file = dst_fd.file;
-
- if (!dst_file) {
- info->status = -EBADF;
- goto next_loop;
- }
-
- if (info->reserved) {
- info->status = -EINVAL;
- goto next_fdput;
- }
-
- deduped = vfs_dedupe_file_range_one(file, off, dst_file,
- info->dest_offset, len,
- REMAP_FILE_CAN_SHORTEN);
- if (deduped == -EBADE)
- info->status = FILE_DEDUPE_RANGE_DIFFERS;
- else if (deduped < 0)
- info->status = deduped;
- else
- info->bytes_deduped = len;
+ if (!(file_in->f_mode & FMODE_READ) ||
+ !(file_out->f_mode & FMODE_WRITE) ||
+ (file_out->f_flags & O_APPEND))
+ return -EBADF;
-next_fdput:
- fdput(dst_fd);
-next_loop:
- if (fatal_signal_pending(current))
- break;
- }
- return ret;
+ return 0;
}
-EXPORT_SYMBOL(vfs_dedupe_file_range);