diff options
Diffstat (limited to 'fs/xfs/xfs_file.c')
-rw-r--r-- | fs/xfs/xfs_file.c | 573 |
1 files changed, 366 insertions, 207 deletions
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 4cdc54dc9686..b19916b11fd5 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -348,9 +348,82 @@ xfs_file_splice_read( } /* + * Take care of zeroing post-EOF blocks when they might exist. + * + * Returns 0 if successfully, a negative error for a failure, or 1 if this + * function dropped the iolock and reacquired it exclusively and the caller + * needs to restart the write sanity checks. + */ +static ssize_t +xfs_file_write_zero_eof( + struct kiocb *iocb, + struct iov_iter *from, + unsigned int *iolock, + size_t count, + bool *drained_dio) +{ + struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); + loff_t isize; + int error; + + /* + * We need to serialise against EOF updates that occur in IO completions + * here. We want to make sure that nobody is changing the size while + * we do this check until we have placed an IO barrier (i.e. hold + * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The + * spinlock effectively forms a memory barrier once we have + * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and + * hence be able to correctly determine if we need to run zeroing. + */ + spin_lock(&ip->i_flags_lock); + isize = i_size_read(VFS_I(ip)); + if (iocb->ki_pos <= isize) { + spin_unlock(&ip->i_flags_lock); + return 0; + } + spin_unlock(&ip->i_flags_lock); + + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + + if (!*drained_dio) { + /* + * If zeroing is needed and we are currently holding the iolock + * shared, we need to update it to exclusive which implies + * having to redo all checks before. + */ + if (*iolock == XFS_IOLOCK_SHARED) { + xfs_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_ilock(ip, *iolock); + iov_iter_reexpand(from, count); + } + + /* + * We now have an IO submission barrier in place, but AIO can do + * EOF updates during IO completion and hence we now need to + * wait for all of them to drain. Non-AIO DIO will have drained + * before we are given the XFS_IOLOCK_EXCL, and so for most + * cases this wait is a no-op. + */ + inode_dio_wait(VFS_I(ip)); + *drained_dio = true; + return 1; + } + + trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); + + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + + return error; +} + +/* * Common pre-write limit and setup checks. * - * Called with the iolocked held either shared and exclusive according to + * Called with the iolock held either shared and exclusive according to * @iolock, and returns with it held. Might upgrade the iolock to exclusive * if called for a direct write beyond i_size. */ @@ -360,13 +433,10 @@ xfs_file_write_checks( struct iov_iter *from, unsigned int *iolock) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct xfs_inode *ip = XFS_I(inode); - ssize_t error = 0; + struct inode *inode = iocb->ki_filp->f_mapping->host; size_t count = iov_iter_count(from); bool drained_dio = false; - loff_t isize; + ssize_t error; restart: error = generic_write_checks(iocb, from); @@ -389,7 +459,7 @@ restart: * exclusively. */ if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { - xfs_iunlock(ip, *iolock); + xfs_iunlock(XFS_I(inode), *iolock); *iolock = XFS_IOLOCK_EXCL; error = xfs_ilock_iocb(iocb, *iolock); if (error) { @@ -400,64 +470,24 @@ restart: } /* - * If the offset is beyond the size of the file, we need to zero any + * If the offset is beyond the size of the file, we need to zero all * blocks that fall between the existing EOF and the start of this - * write. If zeroing is needed and we are currently holding the iolock - * shared, we need to update it to exclusive which implies having to - * redo all checks before. - * - * We need to serialise against EOF updates that occur in IO completions - * here. We want to make sure that nobody is changing the size while we - * do this check until we have placed an IO barrier (i.e. hold the - * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The - * spinlock effectively forms a memory barrier once we have the - * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and - * hence be able to correctly determine if we need to run zeroing. + * write. * - * We can do an unlocked check here safely as IO completion can only - * extend EOF. Truncate is locked out at this point, so the EOF can - * not move backwards, only forwards. Hence we only need to take the - * slow path and spin locks when we are at or beyond the current EOF. + * We can do an unlocked check for i_size here safely as I/O completion + * can only extend EOF. Truncate is locked out at this point, so the + * EOF can not move backwards, only forwards. Hence we only need to take + * the slow path when we are at or beyond the current EOF. */ - if (iocb->ki_pos <= i_size_read(inode)) - goto out; - - spin_lock(&ip->i_flags_lock); - isize = i_size_read(inode); - if (iocb->ki_pos > isize) { - spin_unlock(&ip->i_flags_lock); - - if (iocb->ki_flags & IOCB_NOWAIT) - return -EAGAIN; - - if (!drained_dio) { - if (*iolock == XFS_IOLOCK_SHARED) { - xfs_iunlock(ip, *iolock); - *iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, *iolock); - iov_iter_reexpand(from, count); - } - /* - * We now have an IO submission barrier in place, but - * AIO can do EOF updates during IO completion and hence - * we now need to wait for all of them to drain. Non-AIO - * DIO will have drained before we are given the - * XFS_IOLOCK_EXCL, and so for most cases this wait is a - * no-op. - */ - inode_dio_wait(inode); - drained_dio = true; + if (iocb->ki_pos > i_size_read(inode)) { + error = xfs_file_write_zero_eof(iocb, from, iolock, count, + &drained_dio); + if (error == 1) goto restart; - } - - trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); - error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); if (error) return error; - } else - spin_unlock(&ip->i_flags_lock); + } -out: return kiocb_modified(iocb); } @@ -760,7 +790,7 @@ write_retry: trace_xfs_file_buffered_write(iocb, from); ret = iomap_file_buffered_write(iocb, from, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, NULL); /* * If we hit a space limit, try to free up some lingering preallocated @@ -852,6 +882,192 @@ static inline bool xfs_file_sync_writes(struct file *filp) return false; } +static int +xfs_falloc_newsize( + struct file *file, + int mode, + loff_t offset, + loff_t len, + loff_t *new_size) +{ + struct inode *inode = file_inode(file); + + if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode)) + return 0; + *new_size = offset + len; + return inode_newsize_ok(inode, *new_size); +} + +static int +xfs_falloc_setsize( + struct file *file, + loff_t new_size) +{ + struct iattr iattr = { + .ia_valid = ATTR_SIZE, + .ia_size = new_size, + }; + + if (!new_size) + return 0; + return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file), + &iattr); +} + +static int +xfs_falloc_collapse_range( + struct file *file, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + loff_t new_size = i_size_read(inode) - len; + int error; + + if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len)) + return -EINVAL; + + /* + * There is no need to overlap collapse range with EOF, in which case it + * is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) + return -EINVAL; + + error = xfs_collapse_file_space(XFS_I(inode), offset, len); + if (error) + return error; + return xfs_falloc_setsize(file, new_size); +} + +static int +xfs_falloc_insert_range( + struct file *file, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + loff_t isize = i_size_read(inode); + int error; + + if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len)) + return -EINVAL; + + /* + * New inode size must not exceed ->s_maxbytes, accounting for + * possible signed overflow. + */ + if (inode->i_sb->s_maxbytes - isize < len) + return -EFBIG; + + /* Offset should be less than i_size */ + if (offset >= isize) + return -EINVAL; + + error = xfs_falloc_setsize(file, isize + len); + if (error) + return error; + + /* + * Perform hole insertion now that the file size has been updated so + * that if we crash during the operation we don't leave shifted extents + * past EOF and hence losing access to the data that is contained within + * them. + */ + return xfs_insert_file_space(XFS_I(inode), offset, len); +} + +/* + * Punch a hole and prealloc the range. We use a hole punch rather than + * unwritten extent conversion for two reasons: + * + * 1.) Hole punch handles partial block zeroing for us. + * 2.) If prealloc returns ENOSPC, the file range is still zero-valued by + * virtue of the hole punch. + */ +static int +xfs_falloc_zero_range( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + unsigned int blksize = i_blocksize(inode); + loff_t new_size = 0; + int error; + + trace_xfs_zero_file_space(XFS_I(inode)); + + error = xfs_falloc_newsize(file, mode, offset, len, &new_size); + if (error) + return error; + + error = xfs_free_file_space(XFS_I(inode), offset, len); + if (error) + return error; + + len = round_up(offset + len, blksize) - round_down(offset, blksize); + offset = round_down(offset, blksize); + error = xfs_alloc_file_space(XFS_I(inode), offset, len); + if (error) + return error; + return xfs_falloc_setsize(file, new_size); +} + +static int +xfs_falloc_unshare_range( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + loff_t new_size = 0; + int error; + + error = xfs_falloc_newsize(file, mode, offset, len, &new_size); + if (error) + return error; + + error = xfs_reflink_unshare(XFS_I(inode), offset, len); + if (error) + return error; + + error = xfs_alloc_file_space(XFS_I(inode), offset, len); + if (error) + return error; + return xfs_falloc_setsize(file, new_size); +} + +static int +xfs_falloc_allocate_range( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + loff_t new_size = 0; + int error; + + /* + * If always_cow mode we can't use preallocations and thus should not + * create them. + */ + if (xfs_is_always_cow_inode(XFS_I(inode))) + return -EOPNOTSUPP; + + error = xfs_falloc_newsize(file, mode, offset, len, &new_size); + if (error) + return error; + + error = xfs_alloc_file_space(XFS_I(inode), offset, len); + if (error) + return error; + return xfs_falloc_setsize(file, new_size); +} + #define XFS_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ @@ -868,8 +1084,6 @@ xfs_file_fallocate( struct xfs_inode *ip = XFS_I(inode); long error; uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - loff_t new_size = 0; - bool do_file_insert = false; if (!S_ISREG(inode->i_mode)) return -EINVAL; @@ -890,156 +1104,35 @@ xfs_file_fallocate( */ inode_dio_wait(inode); - /* - * Now AIO and DIO has drained we flush and (if necessary) invalidate - * the cached range over the first operation we are about to run. - * - * We care about zero and collapse here because they both run a hole - * punch over the range first. Because that can zero data, and the range - * of invalidation for the shift operations is much larger, we still do - * the required flush for collapse in xfs_prepare_shift(). - * - * Insert has the same range requirements as collapse, and we extend the - * file first which can zero data. Hence insert has the same - * flush/invalidate requirements as collapse and so they are both - * handled at the right time by xfs_prepare_shift(). - */ - if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE | - FALLOC_FL_COLLAPSE_RANGE)) { - error = xfs_flush_unmap_range(ip, offset, len); - if (error) - goto out_unlock; - } - error = file_modified(file); if (error) goto out_unlock; - if (mode & FALLOC_FL_PUNCH_HOLE) { + switch (mode & FALLOC_FL_MODE_MASK) { + case FALLOC_FL_PUNCH_HOLE: error = xfs_free_file_space(ip, offset, len); - if (error) - goto out_unlock; - } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { - if (!xfs_is_falloc_aligned(ip, offset, len)) { - error = -EINVAL; - goto out_unlock; - } - - /* - * There is no need to overlap collapse range with EOF, - * in which case it is effectively a truncate operation - */ - if (offset + len >= i_size_read(inode)) { - error = -EINVAL; - goto out_unlock; - } - - new_size = i_size_read(inode) - len; - - error = xfs_collapse_file_space(ip, offset, len); - if (error) - goto out_unlock; - } else if (mode & FALLOC_FL_INSERT_RANGE) { - loff_t isize = i_size_read(inode); - - if (!xfs_is_falloc_aligned(ip, offset, len)) { - error = -EINVAL; - goto out_unlock; - } - - /* - * New inode size must not exceed ->s_maxbytes, accounting for - * possible signed overflow. - */ - if (inode->i_sb->s_maxbytes - isize < len) { - error = -EFBIG; - goto out_unlock; - } - new_size = isize + len; - - /* Offset should be less than i_size */ - if (offset >= isize) { - error = -EINVAL; - goto out_unlock; - } - do_file_insert = true; - } else { - if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { - new_size = offset + len; - error = inode_newsize_ok(inode, new_size); - if (error) - goto out_unlock; - } - - if (mode & FALLOC_FL_ZERO_RANGE) { - /* - * Punch a hole and prealloc the range. We use a hole - * punch rather than unwritten extent conversion for two - * reasons: - * - * 1.) Hole punch handles partial block zeroing for us. - * 2.) If prealloc returns ENOSPC, the file range is - * still zero-valued by virtue of the hole punch. - */ - unsigned int blksize = i_blocksize(inode); - - trace_xfs_zero_file_space(ip); - - error = xfs_free_file_space(ip, offset, len); - if (error) - goto out_unlock; - - len = round_up(offset + len, blksize) - - round_down(offset, blksize); - offset = round_down(offset, blksize); - } else if (mode & FALLOC_FL_UNSHARE_RANGE) { - error = xfs_reflink_unshare(ip, offset, len); - if (error) - goto out_unlock; - } else { - /* - * If always_cow mode we can't use preallocations and - * thus should not create them. - */ - if (xfs_is_always_cow_inode(ip)) { - error = -EOPNOTSUPP; - goto out_unlock; - } - } - - if (!xfs_is_always_cow_inode(ip)) { - error = xfs_alloc_file_space(ip, offset, len); - if (error) - goto out_unlock; - } - } - - /* Change file size if needed */ - if (new_size) { - struct iattr iattr; - - iattr.ia_valid = ATTR_SIZE; - iattr.ia_size = new_size; - error = xfs_vn_setattr_size(file_mnt_idmap(file), - file_dentry(file), &iattr); - if (error) - goto out_unlock; - } - - /* - * Perform hole insertion now that the file size has been - * updated so that if we crash during the operation we don't - * leave shifted extents past EOF and hence losing access to - * the data that is contained within them. - */ - if (do_file_insert) { - error = xfs_insert_file_space(ip, offset, len); - if (error) - goto out_unlock; + break; + case FALLOC_FL_COLLAPSE_RANGE: + error = xfs_falloc_collapse_range(file, offset, len); + break; + case FALLOC_FL_INSERT_RANGE: + error = xfs_falloc_insert_range(file, offset, len); + break; + case FALLOC_FL_ZERO_RANGE: + error = xfs_falloc_zero_range(file, mode, offset, len); + break; + case FALLOC_FL_UNSHARE_RANGE: + error = xfs_falloc_unshare_range(file, mode, offset, len); + break; + case FALLOC_FL_ALLOCATE_RANGE: + error = xfs_falloc_allocate_range(file, mode, offset, len); + break; + default: + error = -EOPNOTSUPP; + break; } - if (xfs_file_sync_writes(file)) + if (!error && xfs_file_sync_writes(file)) error = xfs_log_force_inode(ip); out_unlock: @@ -1175,12 +1268,78 @@ xfs_dir_open( return error; } +/* + * Don't bother propagating errors. We're just doing cleanup, and the caller + * ignores the return value anyway. + */ STATIC int xfs_file_release( - struct inode *inode, - struct file *filp) + struct inode *inode, + struct file *file) { - return xfs_release(XFS_I(inode)); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + + /* + * If this is a read-only mount or the file system has been shut down, + * don't generate I/O. + */ + if (xfs_is_readonly(mp) || xfs_is_shutdown(mp)) + return 0; + + /* + * If we previously truncated this file and removed old data in the + * process, we want to initiate "early" writeout on the last close. + * This is an attempt to combat the notorious NULL files problem which + * is particularly noticeable from a truncate down, buffered (re-)write + * (delalloc), followed by a crash. What we are effectively doing here + * is significantly reducing the time window where we'd otherwise be + * exposed to that problem. + */ + if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) { + xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED); + if (ip->i_delayed_blks > 0) + filemap_flush(inode->i_mapping); + } + + /* + * XFS aggressively preallocates post-EOF space to generate contiguous + * allocations for writers that append to the end of the file. + * + * To support workloads that close and reopen the file frequently, these + * preallocations usually persist after a close unless it is the first + * close for the inode. This is a tradeoff to generate tightly packed + * data layouts for unpacking tarballs or similar archives that write + * one file after another without going back to it while keeping the + * preallocation for files that have recurring open/write/close cycles. + * + * This heuristic is skipped for inodes with the append-only flag as + * that flag is rather pointless for inodes written only once. + * + * There is no point in freeing blocks here for open but unlinked files + * as they will be taken care of by the inactivation path soon. + * + * When releasing a read-only context, don't flush data or trim post-EOF + * blocks. This avoids open/read/close workloads from removing EOF + * blocks that other writers depend upon to reduce fragmentation. + * + * If we can't get the iolock just skip truncating the blocks past EOF + * because we could deadlock with the mmap_lock otherwise. We'll get + * another chance to drop them once the last reference to the inode is + * dropped, so we'll never leak blocks permanently. + */ + if (inode->i_nlink && + (file->f_mode & FMODE_WRITE) && + !(ip->i_diflags & XFS_DIFLAG_APPEND) && + !xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) && + xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + if (xfs_can_free_eofblocks(ip) && + !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED)) + xfs_free_eofblocks(ip); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + } + + return 0; } STATIC int |