summary | refs | log | tree | commit | diff
path: root/fs/xfs/xfs_file.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_file.c')
-rw-r--r-- fs/xfs/xfs_file.c 573
1 files changed, 366 insertions, 207 deletions
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 4cdc54dc9686..b19916b11fd5 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -348,9 +348,82 @@ xfs_file_splice_read(
}
/*
+ * Take care of zeroing post-EOF blocks when they might exist.
+ *
+ * Returns 0 if successful, a negative error on failure, or 1 if this
+ * function dropped the iolock and reacquired it exclusively and the caller
+ * needs to restart the write sanity checks.
+ */
+static ssize_t
+xfs_file_write_zero_eof(
+ struct kiocb *iocb,
+ struct iov_iter *from,
+ unsigned int *iolock,
+ size_t count,
+ bool *drained_dio)
+{
+ struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
+ loff_t isize;
+ int error;
+
+ /*
+ * We need to serialise against EOF updates that occur in IO completions
+ * here. We want to make sure that nobody is changing the size while
+ * we do this check until we have placed an IO barrier (i.e. hold
+ * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
+ * spinlock effectively forms a memory barrier once we have
+ * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
+ * hence be able to correctly determine if we need to run zeroing.
+ */
+ spin_lock(&ip->i_flags_lock);
+ isize = i_size_read(VFS_I(ip));
+ if (iocb->ki_pos <= isize) {
+ spin_unlock(&ip->i_flags_lock);
+ return 0;
+ }
+ spin_unlock(&ip->i_flags_lock);
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+
+ if (!*drained_dio) {
+ /*
+ * If zeroing is needed and we are currently holding the iolock
+ * shared, we need to upgrade it to exclusive, which implies
+ * having to redo all of the preceding checks.
+ */
+ if (*iolock == XFS_IOLOCK_SHARED) {
+ xfs_iunlock(ip, *iolock);
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_ilock(ip, *iolock);
+ iov_iter_reexpand(from, count);
+ }
+
+ /*
+ * We now have an IO submission barrier in place, but AIO can do
+ * EOF updates during IO completion and hence we now need to
+ * wait for all of them to drain. Non-AIO DIO will have drained
+ * before we are given the XFS_IOLOCK_EXCL, and so for most
+ * cases this wait is a no-op.
+ */
+ inode_dio_wait(VFS_I(ip));
+ *drained_dio = true;
+ return 1;
+ }
+
+ trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
+
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
+ xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+
+ return error;
+}
+
+/*
* Common pre-write limit and setup checks.
*
- * Called with the iolocked held either shared and exclusive according to
+ * Called with the iolock held either shared or exclusive according to
* @iolock, and returns with it held. Might upgrade the iolock to exclusive
* if called for a direct write beyond i_size.
*/
@@ -360,13 +433,10 @@ xfs_file_write_checks(
struct iov_iter *from,
unsigned int *iolock)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- ssize_t error = 0;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
size_t count = iov_iter_count(from);
bool drained_dio = false;
- loff_t isize;
+ ssize_t error;
restart:
error = generic_write_checks(iocb, from);
@@ -389,7 +459,7 @@ restart:
* exclusively.
*/
if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
- xfs_iunlock(ip, *iolock);
+ xfs_iunlock(XFS_I(inode), *iolock);
*iolock = XFS_IOLOCK_EXCL;
error = xfs_ilock_iocb(iocb, *iolock);
if (error) {
@@ -400,64 +470,24 @@ restart:
}
/*
- * If the offset is beyond the size of the file, we need to zero any
+ * If the offset is beyond the size of the file, we need to zero all
* blocks that fall between the existing EOF and the start of this
- * write. If zeroing is needed and we are currently holding the iolock
- * shared, we need to update it to exclusive which implies having to
- * redo all checks before.
- *
- * We need to serialise against EOF updates that occur in IO completions
- * here. We want to make sure that nobody is changing the size while we
- * do this check until we have placed an IO barrier (i.e. hold the
- * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
- * spinlock effectively forms a memory barrier once we have the
- * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
- * hence be able to correctly determine if we need to run zeroing.
+ * write.
*
- * We can do an unlocked check here safely as IO completion can only
- * extend EOF. Truncate is locked out at this point, so the EOF can
- * not move backwards, only forwards. Hence we only need to take the
- * slow path and spin locks when we are at or beyond the current EOF.
+ * We can do an unlocked check for i_size here safely as I/O completion
+ * can only extend EOF. Truncate is locked out at this point, so the
+ * EOF can not move backwards, only forwards. Hence we only need to take
+ * the slow path when we are at or beyond the current EOF.
*/
- if (iocb->ki_pos <= i_size_read(inode))
- goto out;
-
- spin_lock(&ip->i_flags_lock);
- isize = i_size_read(inode);
- if (iocb->ki_pos > isize) {
- spin_unlock(&ip->i_flags_lock);
-
- if (iocb->ki_flags & IOCB_NOWAIT)
- return -EAGAIN;
-
- if (!drained_dio) {
- if (*iolock == XFS_IOLOCK_SHARED) {
- xfs_iunlock(ip, *iolock);
- *iolock = XFS_IOLOCK_EXCL;
- xfs_ilock(ip, *iolock);
- iov_iter_reexpand(from, count);
- }
- /*
- * We now have an IO submission barrier in place, but
- * AIO can do EOF updates during IO completion and hence
- * we now need to wait for all of them to drain. Non-AIO
- * DIO will have drained before we are given the
- * XFS_IOLOCK_EXCL, and so for most cases this wait is a
- * no-op.
- */
- inode_dio_wait(inode);
- drained_dio = true;
+ if (iocb->ki_pos > i_size_read(inode)) {
+ error = xfs_file_write_zero_eof(iocb, from, iolock, count,
+ &drained_dio);
+ if (error == 1)
goto restart;
- }
-
- trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
- error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
if (error)
return error;
- } else
- spin_unlock(&ip->i_flags_lock);
+ }
-out:
return kiocb_modified(iocb);
}
@@ -760,7 +790,7 @@ write_retry:
trace_xfs_file_buffered_write(iocb, from);
ret = iomap_file_buffered_write(iocb, from,
- &xfs_buffered_write_iomap_ops);
+ &xfs_buffered_write_iomap_ops, NULL);
/*
* If we hit a space limit, try to free up some lingering preallocated
@@ -852,6 +882,192 @@ static inline bool xfs_file_sync_writes(struct file *filp)
return false;
}
+static int
+xfs_falloc_newsize(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len,
+ loff_t *new_size)
+{
+ struct inode *inode = file_inode(file);
+
+ if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode))
+ return 0;
+ *new_size = offset + len;
+ return inode_newsize_ok(inode, *new_size);
+}
+
+static int
+xfs_falloc_setsize(
+ struct file *file,
+ loff_t new_size)
+{
+ struct iattr iattr = {
+ .ia_valid = ATTR_SIZE,
+ .ia_size = new_size,
+ };
+
+ if (!new_size)
+ return 0;
+ return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file),
+ &iattr);
+}
+
+static int
+xfs_falloc_collapse_range(
+ struct file *file,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ loff_t new_size = i_size_read(inode) - len;
+ int error;
+
+ if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
+ return -EINVAL;
+
+ /*
+ * A collapse range must not reach or extend past EOF: that would
+ * effectively be a truncate operation, so it is rejected.
+ */
+ if (offset + len >= i_size_read(inode))
+ return -EINVAL;
+
+ error = xfs_collapse_file_space(XFS_I(inode), offset, len);
+ if (error)
+ return error;
+ return xfs_falloc_setsize(file, new_size);
+}
+
+static int
+xfs_falloc_insert_range(
+ struct file *file,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ loff_t isize = i_size_read(inode);
+ int error;
+
+ if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
+ return -EINVAL;
+
+ /*
+ * New inode size must not exceed ->s_maxbytes, accounting for
+ * possible signed overflow.
+ */
+ if (inode->i_sb->s_maxbytes - isize < len)
+ return -EFBIG;
+
+ /* Offset should be less than i_size */
+ if (offset >= isize)
+ return -EINVAL;
+
+ error = xfs_falloc_setsize(file, isize + len);
+ if (error)
+ return error;
+
+ /*
+ * Perform hole insertion now that the file size has been updated so
+ * that if we crash during the operation we don't leave shifted extents
+ * past EOF and hence lose access to the data that is contained within
+ * them.
+ */
+ return xfs_insert_file_space(XFS_I(inode), offset, len);
+}
+
+/*
+ * Punch a hole and prealloc the range. We use a hole punch rather than
+ * unwritten extent conversion for two reasons:
+ *
+ * 1.) Hole punch handles partial block zeroing for us.
+ * 2.) If prealloc returns ENOSPC, the file range is still zero-valued by
+ * virtue of the hole punch.
+ */
+static int
+xfs_falloc_zero_range(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ unsigned int blksize = i_blocksize(inode);
+ loff_t new_size = 0;
+ int error;
+
+ trace_xfs_zero_file_space(XFS_I(inode));
+
+ error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
+ if (error)
+ return error;
+
+ error = xfs_free_file_space(XFS_I(inode), offset, len);
+ if (error)
+ return error;
+
+ len = round_up(offset + len, blksize) - round_down(offset, blksize);
+ offset = round_down(offset, blksize);
+ error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+ if (error)
+ return error;
+ return xfs_falloc_setsize(file, new_size);
+}
+
+static int
+xfs_falloc_unshare_range(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ loff_t new_size = 0;
+ int error;
+
+ error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
+ if (error)
+ return error;
+
+ error = xfs_reflink_unshare(XFS_I(inode), offset, len);
+ if (error)
+ return error;
+
+ error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+ if (error)
+ return error;
+ return xfs_falloc_setsize(file, new_size);
+}
+
+static int
+xfs_falloc_allocate_range(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ loff_t new_size = 0;
+ int error;
+
+ /*
+ * In always_cow mode we can't use preallocations and thus should not
+ * create them.
+ */
+ if (xfs_is_always_cow_inode(XFS_I(inode)))
+ return -EOPNOTSUPP;
+
+ error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
+ if (error)
+ return error;
+
+ error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+ if (error)
+ return error;
+ return xfs_falloc_setsize(file, new_size);
+}
+
#define XFS_FALLOC_FL_SUPPORTED \
(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
@@ -868,8 +1084,6 @@ xfs_file_fallocate(
struct xfs_inode *ip = XFS_I(inode);
long error;
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
- loff_t new_size = 0;
- bool do_file_insert = false;
if (!S_ISREG(inode->i_mode))
return -EINVAL;
@@ -890,156 +1104,35 @@ xfs_file_fallocate(
*/
inode_dio_wait(inode);
- /*
- * Now AIO and DIO has drained we flush and (if necessary) invalidate
- * the cached range over the first operation we are about to run.
- *
- * We care about zero and collapse here because they both run a hole
- * punch over the range first. Because that can zero data, and the range
- * of invalidation for the shift operations is much larger, we still do
- * the required flush for collapse in xfs_prepare_shift().
- *
- * Insert has the same range requirements as collapse, and we extend the
- * file first which can zero data. Hence insert has the same
- * flush/invalidate requirements as collapse and so they are both
- * handled at the right time by xfs_prepare_shift().
- */
- if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
- FALLOC_FL_COLLAPSE_RANGE)) {
- error = xfs_flush_unmap_range(ip, offset, len);
- if (error)
- goto out_unlock;
- }
-
error = file_modified(file);
if (error)
goto out_unlock;
- if (mode & FALLOC_FL_PUNCH_HOLE) {
+ switch (mode & FALLOC_FL_MODE_MASK) {
+ case FALLOC_FL_PUNCH_HOLE:
error = xfs_free_file_space(ip, offset, len);
- if (error)
- goto out_unlock;
- } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
- if (!xfs_is_falloc_aligned(ip, offset, len)) {
- error = -EINVAL;
- goto out_unlock;
- }
-
- /*
- * There is no need to overlap collapse range with EOF,
- * in which case it is effectively a truncate operation
- */
- if (offset + len >= i_size_read(inode)) {
- error = -EINVAL;
- goto out_unlock;
- }
-
- new_size = i_size_read(inode) - len;
-
- error = xfs_collapse_file_space(ip, offset, len);
- if (error)
- goto out_unlock;
- } else if (mode & FALLOC_FL_INSERT_RANGE) {
- loff_t isize = i_size_read(inode);
-
- if (!xfs_is_falloc_aligned(ip, offset, len)) {
- error = -EINVAL;
- goto out_unlock;
- }
-
- /*
- * New inode size must not exceed ->s_maxbytes, accounting for
- * possible signed overflow.
- */
- if (inode->i_sb->s_maxbytes - isize < len) {
- error = -EFBIG;
- goto out_unlock;
- }
- new_size = isize + len;
-
- /* Offset should be less than i_size */
- if (offset >= isize) {
- error = -EINVAL;
- goto out_unlock;
- }
- do_file_insert = true;
- } else {
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- offset + len > i_size_read(inode)) {
- new_size = offset + len;
- error = inode_newsize_ok(inode, new_size);
- if (error)
- goto out_unlock;
- }
-
- if (mode & FALLOC_FL_ZERO_RANGE) {
- /*
- * Punch a hole and prealloc the range. We use a hole
- * punch rather than unwritten extent conversion for two
- * reasons:
- *
- * 1.) Hole punch handles partial block zeroing for us.
- * 2.) If prealloc returns ENOSPC, the file range is
- * still zero-valued by virtue of the hole punch.
- */
- unsigned int blksize = i_blocksize(inode);
-
- trace_xfs_zero_file_space(ip);
-
- error = xfs_free_file_space(ip, offset, len);
- if (error)
- goto out_unlock;
-
- len = round_up(offset + len, blksize) -
- round_down(offset, blksize);
- offset = round_down(offset, blksize);
- } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
- error = xfs_reflink_unshare(ip, offset, len);
- if (error)
- goto out_unlock;
- } else {
- /*
- * If always_cow mode we can't use preallocations and
- * thus should not create them.
- */
- if (xfs_is_always_cow_inode(ip)) {
- error = -EOPNOTSUPP;
- goto out_unlock;
- }
- }
-
- if (!xfs_is_always_cow_inode(ip)) {
- error = xfs_alloc_file_space(ip, offset, len);
- if (error)
- goto out_unlock;
- }
- }
-
- /* Change file size if needed */
- if (new_size) {
- struct iattr iattr;
-
- iattr.ia_valid = ATTR_SIZE;
- iattr.ia_size = new_size;
- error = xfs_vn_setattr_size(file_mnt_idmap(file),
- file_dentry(file), &iattr);
- if (error)
- goto out_unlock;
- }
-
- /*
- * Perform hole insertion now that the file size has been
- * updated so that if we crash during the operation we don't
- * leave shifted extents past EOF and hence losing access to
- * the data that is contained within them.
- */
- if (do_file_insert) {
- error = xfs_insert_file_space(ip, offset, len);
- if (error)
- goto out_unlock;
+ break;
+ case FALLOC_FL_COLLAPSE_RANGE:
+ error = xfs_falloc_collapse_range(file, offset, len);
+ break;
+ case FALLOC_FL_INSERT_RANGE:
+ error = xfs_falloc_insert_range(file, offset, len);
+ break;
+ case FALLOC_FL_ZERO_RANGE:
+ error = xfs_falloc_zero_range(file, mode, offset, len);
+ break;
+ case FALLOC_FL_UNSHARE_RANGE:
+ error = xfs_falloc_unshare_range(file, mode, offset, len);
+ break;
+ case FALLOC_FL_ALLOCATE_RANGE:
+ error = xfs_falloc_allocate_range(file, mode, offset, len);
+ break;
+ default:
+ error = -EOPNOTSUPP;
+ break;
}
- if (xfs_file_sync_writes(file))
+ if (!error && xfs_file_sync_writes(file))
error = xfs_log_force_inode(ip);
out_unlock:
@@ -1175,12 +1268,78 @@ xfs_dir_open(
return error;
}
+/*
+ * Don't bother propagating errors. We're just doing cleanup, and the caller
+ * ignores the return value anyway.
+ */
STATIC int
xfs_file_release(
- struct inode *inode,
- struct file *filp)
+ struct inode *inode,
+ struct file *file)
{
- return xfs_release(XFS_I(inode));
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+
+ /*
+ * If this is a read-only mount or the file system has been shut down,
+ * don't generate I/O.
+ */
+ if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
+ return 0;
+
+ /*
+ * If we previously truncated this file and removed old data in the
+ * process, we want to initiate "early" writeout on the last close.
+ * This is an attempt to combat the notorious NULL files problem which
+ * is particularly noticeable from a truncate down, buffered (re-)write
+ * (delalloc), followed by a crash. What we are effectively doing here
+ * is significantly reducing the time window where we'd otherwise be
+ * exposed to that problem.
+ */
+ if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
+ xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
+ if (ip->i_delayed_blks > 0)
+ filemap_flush(inode->i_mapping);
+ }
+
+ /*
+ * XFS aggressively preallocates post-EOF space to generate contiguous
+ * allocations for writers that append to the end of the file.
+ *
+ * To support workloads that close and reopen the file frequently, these
+ * preallocations usually persist after a close unless it is the first
+ * close for the inode. This is a tradeoff to generate tightly packed
+ * data layouts for unpacking tarballs or similar archives that write
+ * one file after another without going back to it while keeping the
+ * preallocation for files that have recurring open/write/close cycles.
+ *
+ * This heuristic is skipped for inodes with the append-only flag as
+ * that flag is rather pointless for inodes written only once.
+ *
+ * There is no point in freeing blocks here for open but unlinked files
+ * as they will be taken care of by the inactivation path soon.
+ *
+ * When releasing a read-only context, don't flush data or trim post-EOF
+ * blocks. This prevents open/read/close workloads from removing EOF
+ * blocks that other writers depend upon to reduce fragmentation.
+ *
+ * If we can't get the iolock just skip truncating the blocks past EOF
+ * because we could deadlock with the mmap_lock otherwise. We'll get
+ * another chance to drop them once the last reference to the inode is
+ * dropped, so we'll never leak blocks permanently.
+ */
+ if (inode->i_nlink &&
+ (file->f_mode & FMODE_WRITE) &&
+ !(ip->i_diflags & XFS_DIFLAG_APPEND) &&
+ !xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
+ xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+ if (xfs_can_free_eofblocks(ip) &&
+ !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
+ xfs_free_eofblocks(ip);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ }
+
+ return 0;
}
STATIC int