diff options
| -rw-r--r-- | fs/xfs/xfs_reflink.c | 200 | 
1 files changed, 165 insertions, 35 deletions
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 5289e22cb081..42ea7bab9144 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1220,35 +1220,92 @@ retry:  	return 0;  } +/* Unlock both inodes after they've been prepped for a range clone. */ +STATIC void +xfs_reflink_remap_unlock( +	struct file		*file_in, +	struct file		*file_out) +{ +	struct inode		*inode_in = file_inode(file_in); +	struct xfs_inode	*src = XFS_I(inode_in); +	struct inode		*inode_out = file_inode(file_out); +	struct xfs_inode	*dest = XFS_I(inode_out); +	bool			same_inode = (inode_in == inode_out); + +	xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); +	if (!same_inode) +		xfs_iunlock(src, XFS_MMAPLOCK_SHARED); +	inode_unlock(inode_out); +	if (!same_inode) +		inode_unlock_shared(inode_in); +} +  /* - * Link a range of blocks from one file to another. + * If we're reflinking to a point past the destination file's EOF, we must + * zero any speculative post-EOF preallocations that sit between the old EOF + * and the destination file offset.   */ -int -xfs_reflink_remap_range( +static int +xfs_reflink_zero_posteof( +	struct xfs_inode	*ip, +	loff_t			pos) +{ +	loff_t			isize = i_size_read(VFS_I(ip)); + +	if (pos <= isize) +		return 0; + +	trace_xfs_zero_eof(ip, isize, pos - isize); +	return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL, +			&xfs_iomap_ops); +} + +/* + * Prepare two files for range cloning.  Upon a successful return both inodes + * will have the iolock and mmaplock held, the page cache of the out file will + * be truncated, and any leases on the out file will have been broken.  This + * function borrows heavily from xfs_file_aio_write_checks. + * + * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't + * checked that the bytes beyond EOF physically match. Hence we cannot use the + * EOF block in the source dedupe range because it's not a complete block match, + * hence can introduce a corruption into the file that has it's block replaced. + * + * In similar fashion, the VFS file cloning also allows partial EOF blocks to be + * "block aligned" for the purposes of cloning entire files.  However, if the + * source file range includes the EOF block and it lands within the existing EOF + * of the destination file, then we can expose stale data from beyond the source + * file EOF in the destination file. + * + * XFS doesn't support partial block sharing, so in both cases we have check + * these cases ourselves. For dedupe, we can simply round the length to dedupe + * down to the previous whole block and ignore the partial EOF block. While this + * means we can't dedupe the last block of a file, this is an acceptible + * tradeoff for simplicity on implementation. + * + * For cloning, we want to share the partial EOF block if it is also the new EOF + * block of the destination file. If the partial EOF block lies inside the + * existing destination EOF, then we have to abort the clone to avoid exposing + * stale data in the destination file. Hence we reject these clone attempts with + * -EINVAL in this case. + */ +STATIC int +xfs_reflink_remap_prep(  	struct file		*file_in,  	loff_t			pos_in,  	struct file		*file_out,  	loff_t			pos_out, -	u64			len, +	u64			*len,  	bool			is_dedupe)  {  	struct inode		*inode_in = file_inode(file_in);  	struct xfs_inode	*src = XFS_I(inode_in);  	struct inode		*inode_out = file_inode(file_out);  	struct xfs_inode	*dest = XFS_I(inode_out); -	struct xfs_mount	*mp = src->i_mount;  	bool			same_inode = (inode_in == inode_out); -	xfs_fileoff_t		sfsbno, dfsbno; -	xfs_filblks_t		fsblen; -	xfs_extlen_t		cowextsize; +	u64			blkmask = i_blocksize(inode_in) - 1;  	ssize_t			ret; -	if (!xfs_sb_version_hasreflink(&mp->m_sb)) -		return -EOPNOTSUPP; - -	if (XFS_FORCED_SHUTDOWN(mp)) -		return -EIO; -  	/* Lock both files against IO */  	ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);  	if (ret) @@ -1270,33 +1327,115 @@ xfs_reflink_remap_range(  		goto out_unlock;  	ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, -			&len, is_dedupe); +			len, is_dedupe);  	if (ret <= 0)  		goto out_unlock; +	/* +	 * If the dedupe data matches, chop off the partial EOF block +	 * from the source file so we don't try to dedupe the partial +	 * EOF block. +	 */ +	if (is_dedupe) { +		*len &= ~blkmask; +	} else if (*len & blkmask) { +		/* +		 * The user is attempting to share a partial EOF block, +		 * if it's inside the destination EOF then reject it. +		 */ +		if (pos_out + *len < i_size_read(inode_out)) { +			ret = -EINVAL; +			goto out_unlock; +		} +	} +  	/* Attach dquots to dest inode before changing block map */  	ret = xfs_qm_dqattach(dest);  	if (ret)  		goto out_unlock; -	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); -  	/* -	 * Clear out post-eof preallocations because we don't have page cache -	 * backing the delayed allocations and they'll never get freed on -	 * their own. +	 * Zero existing post-eof speculative preallocations in the destination +	 * file.  	 */ -	if (xfs_can_free_eofblocks(dest, true)) { -		ret = xfs_free_eofblocks(dest); -		if (ret) -			goto out_unlock; -	} +	ret = xfs_reflink_zero_posteof(dest, pos_out); +	if (ret) +		goto out_unlock;  	/* Set flags and remap blocks. */  	ret = xfs_reflink_set_inode_flag(src, dest);  	if (ret)  		goto out_unlock; +	/* Zap any page cache for the destination file's range. */ +	truncate_inode_pages_range(&inode_out->i_data, pos_out, +				   PAGE_ALIGN(pos_out + *len) - 1); + +	/* If we're altering the file contents... */ +	if (!is_dedupe) { +		/* +		 * ...update the timestamps (which will grab the ilock again +		 * from xfs_fs_dirty_inode, so we have to call it before we +		 * take the ilock). +		 */ +		if (!(file_out->f_mode & FMODE_NOCMTIME)) { +			ret = file_update_time(file_out); +			if (ret) +				goto out_unlock; +		} + +		/* +		 * ...clear the security bits if the process is not being run +		 * by root.  This keeps people from modifying setuid and setgid +		 * binaries. +		 */ +		ret = file_remove_privs(file_out); +		if (ret) +			goto out_unlock; +	} + +	return 1; +out_unlock: +	xfs_reflink_remap_unlock(file_in, file_out); +	return ret; +} + +/* + * Link a range of blocks from one file to another. + */ +int +xfs_reflink_remap_range( +	struct file		*file_in, +	loff_t			pos_in, +	struct file		*file_out, +	loff_t			pos_out, +	u64			len, +	bool			is_dedupe) +{ +	struct inode		*inode_in = file_inode(file_in); +	struct xfs_inode	*src = XFS_I(inode_in); +	struct inode		*inode_out = file_inode(file_out); +	struct xfs_inode	*dest = XFS_I(inode_out); +	struct xfs_mount	*mp = src->i_mount; +	xfs_fileoff_t		sfsbno, dfsbno; +	xfs_filblks_t		fsblen; +	xfs_extlen_t		cowextsize; +	ssize_t			ret; + +	if (!xfs_sb_version_hasreflink(&mp->m_sb)) +		return -EOPNOTSUPP; + +	if (XFS_FORCED_SHUTDOWN(mp)) +		return -EIO; + +	/* Prepare and then clone file data. */ +	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out, +			&len, is_dedupe); +	if (ret <= 0) +		return ret; + +	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); +  	dfsbno = XFS_B_TO_FSBT(mp, pos_out);  	sfsbno = XFS_B_TO_FSBT(mp, pos_in);  	fsblen = XFS_B_TO_FSB(mp, len); @@ -1305,10 +1444,6 @@ xfs_reflink_remap_range(  	if (ret)  		goto out_unlock; -	/* Zap any page cache for the destination file's range. */ -	truncate_inode_pages_range(&inode_out->i_data, pos_out, -				   PAGE_ALIGN(pos_out + len) - 1); -  	/*  	 * Carry the cowextsize hint from src to dest if we're sharing the  	 * entire source file to the entire destination file, the source file @@ -1325,12 +1460,7 @@ xfs_reflink_remap_range(  			is_dedupe);  out_unlock: -	xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); -	if (!same_inode) -		xfs_iunlock(src, XFS_MMAPLOCK_SHARED); -	inode_unlock(inode_out); -	if (!same_inode) -		inode_unlock_shared(inode_in); +	xfs_reflink_remap_unlock(file_in, file_out);  	if (ret)  		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);  	return ret;  | 
