diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-11-02 19:33:08 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-11-02 19:33:08 +0300 |
commit | c2aa1a444cab2c673650ada80a7dffc4345ce2e6 (patch) | |
tree | 3efb7e2213cabd174780b021a8dab2cea0b03386 /fs/read_write.c | |
parent | b69f9e17a57a50bc34d88975afce4425086e525d (diff) | |
parent | bf4a1fcf0bc18d52cf0fce6571d6f327ab5eaf22 (diff) | |
download | linux-c2aa1a444cab2c673650ada80a7dffc4345ce2e6.tar.xz |
Merge tag 'xfs-4.20-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
Pull vfs dedup fixes from Dave Chinner:
"This reworks the vfs data cloning infrastructure.
We discovered many issues with these interfaces late in the 4.19 cycle
- the worst of them (data corruption, setuid stripping) were fixed for
XFS in 4.19-rc8, but a larger rework of the infrastructure fixing all
the problems was needed. That rework is the contents of this pull
request.
Rework the vfs_clone_file_range and vfs_dedupe_file_range
infrastructure to use a common .remap_file_range method and supply
generic bounds and sanity checking functions that are shared with the
data write path. The current VFS infrastructure has problems with
rlimit, LFS file sizes, file time stamps, maximum filesystem file
sizes, stripping setuid bits, etc and so they are addressed in these
commits.
We also introduce the ability for the ->remap_file_range methods to
return short clones so that clones for vfs_copy_file_range() don't get
rejected if the entire range can't be cloned. It also allows
filesystems to sliently skip deduplication of partial EOF blocks if
they are not capable of doing so without requiring errors to be thrown
to userspace.
Existing filesystems are converted to user the new remap_file_range
method, and both XFS and ocfs2 are modified to make use of the new
generic checking infrastructure"
* tag 'xfs-4.20-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (28 commits)
xfs: remove [cm]time update from reflink calls
xfs: remove xfs_reflink_remap_range
xfs: remove redundant remap partial EOF block checks
xfs: support returning partial reflink results
xfs: clean up xfs_reflink_remap_blocks call site
xfs: fix pagecache truncation prior to reflink
ocfs2: remove ocfs2_reflink_remap_range
ocfs2: support partial clone range and dedupe range
ocfs2: fix pagecache truncation prior to reflink
ocfs2: truncate page cache for clone destination file before remapping
vfs: clean up generic_remap_file_range_prep return value
vfs: hide file range comparison function
vfs: enable remap callers that can handle short operations
vfs: plumb remap flags through the vfs dedupe functions
vfs: plumb remap flags through the vfs clone functions
vfs: make remap_file_range functions take and return bytes completed
vfs: remap helper should update destination inode metadata
vfs: pass remap flags to generic_remap_checks
vfs: pass remap flags to generic_remap_file_range_prep
vfs: combine the clone and dedupe into a single remap_file_range
...
Diffstat (limited to 'fs/read_write.c')
-rw-r--r-- | fs/read_write.c | 405 |
1 files changed, 226 insertions, 179 deletions
diff --git a/fs/read_write.c b/fs/read_write.c index 5a2ee488c5d2..bfcb4ced5664 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1587,11 +1587,15 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, * Try cloning first, this is supported by more file systems, and * more efficient if both clone and copy are supported (e.g. NFS). */ - if (file_in->f_op->clone_file_range) { - ret = file_in->f_op->clone_file_range(file_in, pos_in, - file_out, pos_out, len); - if (ret == 0) { - ret = len; + if (file_in->f_op->remap_file_range) { + loff_t cloned; + + cloned = file_in->f_op->remap_file_range(file_in, pos_in, + file_out, pos_out, + min_t(loff_t, MAX_RW_COUNT, len), + REMAP_FILE_CAN_SHORTEN); + if (cloned > 0) { + ret = cloned; goto done; } } @@ -1685,11 +1689,12 @@ out2: return ret; } -static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) +static int remap_verify_area(struct file *file, loff_t pos, loff_t len, + bool write) { struct inode *inode = file_inode(file); - if (unlikely(pos < 0)) + if (unlikely(pos < 0 || len < 0)) return -EINVAL; if (unlikely((loff_t) (pos + len) < 0)) @@ -1707,22 +1712,150 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) return security_file_permission(file, write ? MAY_WRITE : MAY_READ); } +/* + * Ensure that we don't remap a partial EOF block in the middle of something + * else. Assume that the offsets have already been checked for block + * alignment. + * + * For deduplication we always scale down to the previous block because we + * can't meaningfully compare post-EOF contents. + * + * For clone we only link a partial EOF block above the destination file's EOF. + * + * Shorten the request if possible. + */ +static int generic_remap_check_len(struct inode *inode_in, + struct inode *inode_out, + loff_t pos_out, + loff_t *len, + unsigned int remap_flags) +{ + u64 blkmask = i_blocksize(inode_in) - 1; + loff_t new_len = *len; + + if ((*len & blkmask) == 0) + return 0; + + if ((remap_flags & REMAP_FILE_DEDUP) || + pos_out + *len < i_size_read(inode_out)) + new_len &= ~blkmask; + + if (new_len == *len) + return 0; + + if (remap_flags & REMAP_FILE_CAN_SHORTEN) { + *len = new_len; + return 0; + } + + return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; +} + +/* + * Read a page's worth of file data into the page cache. Return the page + * locked. + */ +static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) +{ + struct page *page; + + page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + lock_page(page); + return page; +} + +/* + * Compare extents of two files to see if they are the same. + * Caller must have locked both inodes to prevent write races. + */ +static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dest, loff_t destoff, + loff_t len, bool *is_same) +{ + loff_t src_poff; + loff_t dest_poff; + void *src_addr; + void *dest_addr; + struct page *src_page; + struct page *dest_page; + loff_t cmp_len; + bool same; + int error; + + error = -EINVAL; + same = true; + while (len) { + src_poff = srcoff & (PAGE_SIZE - 1); + dest_poff = destoff & (PAGE_SIZE - 1); + cmp_len = min(PAGE_SIZE - src_poff, + PAGE_SIZE - dest_poff); + cmp_len = min(cmp_len, len); + if (cmp_len <= 0) + goto out_error; + + src_page = vfs_dedupe_get_page(src, srcoff); + if (IS_ERR(src_page)) { + error = PTR_ERR(src_page); + goto out_error; + } + dest_page = vfs_dedupe_get_page(dest, destoff); + if (IS_ERR(dest_page)) { + error = PTR_ERR(dest_page); + unlock_page(src_page); + put_page(src_page); + goto out_error; + } + src_addr = kmap_atomic(src_page); + dest_addr = kmap_atomic(dest_page); + + flush_dcache_page(src_page); + flush_dcache_page(dest_page); + + if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) + same = false; + + kunmap_atomic(dest_addr); + kunmap_atomic(src_addr); + unlock_page(dest_page); + unlock_page(src_page); + put_page(dest_page); + put_page(src_page); + + if (!same) + break; + + srcoff += cmp_len; + destoff += cmp_len; + len -= cmp_len; + } + + *is_same = same; + return 0; + +out_error: + return error; +} /* * Check that the two inodes are eligible for cloning, the ranges make * sense, and then flush all dirty data. Caller must ensure that the * inodes have been locked against any other modifications. * - * Returns: 0 for "nothing to clone", 1 for "something to clone", or - * the usual negative error code. + * If there's an error, then the usual negative error code is returned. + * Otherwise returns 0 with *len set to the request length. */ -int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, - struct inode *inode_out, loff_t pos_out, - u64 *len, bool is_dedupe) +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags) { - loff_t bs = inode_out->i_sb->s_blocksize; - loff_t blen; - loff_t isize; + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); bool same_inode = (inode_in == inode_out); int ret; @@ -1739,50 +1872,24 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; - /* Are we going all the way to the end? */ - isize = i_size_read(inode_in); - if (isize == 0) - return 0; - /* Zero length dedupe exits immediately; reflink goes to EOF. */ if (*len == 0) { - if (is_dedupe || pos_in == isize) + loff_t isize = i_size_read(inode_in); + + if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) return 0; if (pos_in > isize) return -EINVAL; *len = isize - pos_in; + if (*len == 0) + return 0; } - /* Ensure offsets don't wrap and the input is inside i_size */ - if (pos_in + *len < pos_in || pos_out + *len < pos_out || - pos_in + *len > isize) - return -EINVAL; - - /* Don't allow dedupe past EOF in the dest file */ - if (is_dedupe) { - loff_t disize; - - disize = i_size_read(inode_out); - if (pos_out >= disize || pos_out + *len > disize) - return -EINVAL; - } - - /* If we're linking to EOF, continue to the block boundary. */ - if (pos_in + *len == isize) - blen = ALIGN(isize, bs) - pos_in; - else - blen = *len; - - /* Only reflink if we're aligned to block boundaries */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || - !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) - return -EINVAL; - - /* Don't allow overlapped reflink within the same file */ - if (same_inode) { - if (pos_out + blen > pos_in && pos_out < pos_in + blen) - return -EINVAL; - } + /* Check that we don't violate system file offset limits. */ + ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, + remap_flags); + if (ret) + return ret; /* Wait for the completion of any pending IOs on both files */ inode_dio_wait(inode_in); @@ -1802,7 +1909,7 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, /* * Check that the extents are the same. */ - if (is_dedupe) { + if (remap_flags & REMAP_FILE_DEDUP) { bool is_same = false; ret = vfs_dedupe_file_range_compare(inode_in, pos_in, @@ -1813,16 +1920,43 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, return -EBADE; } - return 1; + ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, + remap_flags); + if (ret) + return ret; + + /* If can't alter the file contents, we're done. */ + if (!(remap_flags & REMAP_FILE_DEDUP)) { + /* Update the timestamps, since we can alter file contents. */ + if (!(file_out->f_mode & FMODE_NOCMTIME)) { + ret = file_update_time(file_out); + if (ret) + return ret; + } + + /* + * Clear the security bits if the process is not being run by + * root. This keeps people from modifying setuid and setgid + * binaries. + */ + ret = file_remove_privs(file_out); + if (ret) + return ret; + } + + return 0; } -EXPORT_SYMBOL(vfs_clone_file_prep_inodes); +EXPORT_SYMBOL(generic_remap_file_range_prep); -int do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len) +loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); - int ret; + loff_t ret; + + WARN_ON_ONCE(remap_flags); if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; @@ -1842,140 +1976,43 @@ int do_clone_file_range(struct file *file_in, loff_t pos_in, (file_out->f_flags & O_APPEND)) return -EBADF; - if (!file_in->f_op->clone_file_range) + if (!file_in->f_op->remap_file_range) return -EOPNOTSUPP; - ret = clone_verify_area(file_in, pos_in, len, false); + ret = remap_verify_area(file_in, pos_in, len, false); if (ret) return ret; - ret = clone_verify_area(file_out, pos_out, len, true); + ret = remap_verify_area(file_out, pos_out, len, true); if (ret) return ret; - if (pos_in + len > i_size_read(inode_in)) - return -EINVAL; - - ret = file_in->f_op->clone_file_range(file_in, pos_in, - file_out, pos_out, len); - if (!ret) { - fsnotify_access(file_in); - fsnotify_modify(file_out); - } + ret = file_in->f_op->remap_file_range(file_in, pos_in, + file_out, pos_out, len, remap_flags); + if (ret < 0) + return ret; + fsnotify_access(file_in); + fsnotify_modify(file_out); return ret; } EXPORT_SYMBOL(do_clone_file_range); -int vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len) +loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { - int ret; + loff_t ret; file_start_write(file_out); - ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len); + ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len, + remap_flags); file_end_write(file_out); return ret; } EXPORT_SYMBOL(vfs_clone_file_range); -/* - * Read a page's worth of file data into the page cache. Return the page - * locked. - */ -static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) -{ - struct address_space *mapping; - struct page *page; - pgoff_t n; - - n = offset >> PAGE_SHIFT; - mapping = inode->i_mapping; - page = read_mapping_page(mapping, n, NULL); - if (IS_ERR(page)) - return page; - if (!PageUptodate(page)) { - put_page(page); - return ERR_PTR(-EIO); - } - lock_page(page); - return page; -} - -/* - * Compare extents of two files to see if they are the same. - * Caller must have locked both inodes to prevent write races. - */ -int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, - struct inode *dest, loff_t destoff, - loff_t len, bool *is_same) -{ - loff_t src_poff; - loff_t dest_poff; - void *src_addr; - void *dest_addr; - struct page *src_page; - struct page *dest_page; - loff_t cmp_len; - bool same; - int error; - - error = -EINVAL; - same = true; - while (len) { - src_poff = srcoff & (PAGE_SIZE - 1); - dest_poff = destoff & (PAGE_SIZE - 1); - cmp_len = min(PAGE_SIZE - src_poff, - PAGE_SIZE - dest_poff); - cmp_len = min(cmp_len, len); - if (cmp_len <= 0) - goto out_error; - - src_page = vfs_dedupe_get_page(src, srcoff); - if (IS_ERR(src_page)) { - error = PTR_ERR(src_page); - goto out_error; - } - dest_page = vfs_dedupe_get_page(dest, destoff); - if (IS_ERR(dest_page)) { - error = PTR_ERR(dest_page); - unlock_page(src_page); - put_page(src_page); - goto out_error; - } - src_addr = kmap_atomic(src_page); - dest_addr = kmap_atomic(dest_page); - - flush_dcache_page(src_page); - flush_dcache_page(dest_page); - - if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) - same = false; - - kunmap_atomic(dest_addr); - kunmap_atomic(src_addr); - unlock_page(dest_page); - unlock_page(src_page); - put_page(dest_page); - put_page(src_page); - - if (!same) - break; - - srcoff += cmp_len; - destoff += cmp_len; - len -= cmp_len; - } - - *is_same = same; - return 0; - -out_error: - return error; -} -EXPORT_SYMBOL(vfs_dedupe_file_range_compare); - /* Check whether we are allowed to dedupe the destination file */ static bool allow_file_dedupe(struct file *file) { @@ -1990,16 +2027,20 @@ static bool allow_file_dedupe(struct file *file) return false; } -int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, - struct file *dst_file, loff_t dst_pos, u64 len) +loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, + struct file *dst_file, loff_t dst_pos, + loff_t len, unsigned int remap_flags) { - s64 ret; + loff_t ret; + + WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP | + REMAP_FILE_CAN_SHORTEN)); ret = mnt_want_write_file(dst_file); if (ret) return ret; - ret = clone_verify_area(dst_file, dst_pos, len, true); + ret = remap_verify_area(dst_file, dst_pos, len, true); if (ret < 0) goto out_drop_write; @@ -2016,11 +2057,16 @@ int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, goto out_drop_write; ret = -EINVAL; - if (!dst_file->f_op->dedupe_file_range) + if (!dst_file->f_op->remap_file_range) goto out_drop_write; - ret = dst_file->f_op->dedupe_file_range(src_file, src_pos, - dst_file, dst_pos, len); + if (len == 0) { + ret = 0; + goto out_drop_write; + } + + ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file, + dst_pos, len, remap_flags | REMAP_FILE_DEDUP); out_drop_write: mnt_drop_write_file(dst_file); @@ -2037,7 +2083,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) int i; int ret; u16 count = same->dest_count; - int deduped; + loff_t deduped; if (!(file->f_mode & FMODE_READ)) return -EINVAL; @@ -2056,7 +2102,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) if (!S_ISREG(src->i_mode)) goto out; - ret = clone_verify_area(file, off, len, false); + ret = remap_verify_area(file, off, len, false); if (ret < 0) goto out; ret = 0; @@ -2088,7 +2134,8 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) } deduped = vfs_dedupe_file_range_one(file, off, dst_file, - info->dest_offset, len); + info->dest_offset, len, + REMAP_FILE_CAN_SHORTEN); if (deduped == -EBADE) info->status = FILE_DEDUPE_RANGE_DIFFERS; else if (deduped < 0) |