diff options
Diffstat (limited to 'fs/xfs/xfs_iops.c')
-rw-r--r-- | fs/xfs/xfs_iops.c | 178 |
1 files changed, 158 insertions, 20 deletions
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ee79cf161312..8cddbb7c149b 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -29,6 +29,7 @@ #include "xfs_xattr.h" #include "xfs_file.h" #include "xfs_bmap.h" +#include "xfs_zone_alloc.h" #include <linux/posix_acl.h> #include <linux/security.h> @@ -42,7 +43,9 @@ * held. For regular files, the lock order is the other way around - the * mmap_lock is taken during the page fault, and then we lock the ilock to do * block mapping. Hence we need a different class for the directory ilock so - * that lockdep can tell them apart. + * that lockdep can tell them apart. Directories in the metadata directory + * tree get a separate class so that lockdep reports will warn us if someone + * ever tries to lock regular directories after locking metadata directories. */ static struct lock_class_key xfs_nondir_ilock_class; static struct lock_class_key xfs_dir_ilock_class; @@ -296,14 +299,14 @@ xfs_vn_create( return xfs_generic_create(idmap, dir, dentry, mode, 0, NULL); } -STATIC int +STATIC struct dentry * xfs_vn_mkdir( struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - return xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL); + return ERR_PTR(xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL)); } STATIC struct dentry * @@ -570,6 +573,112 @@ xfs_stat_blksize( return max_t(uint32_t, PAGE_SIZE, mp->m_sb.sb_blocksize); } +static void +xfs_report_dioalign( + struct xfs_inode *ip, + struct kstat *stat) +{ + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + struct block_device *bdev = target->bt_bdev; + + stat->result_mask |= STATX_DIOALIGN | STATX_DIO_READ_ALIGN; + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; + + /* + * For COW inodes, we can only perform out of place writes of entire + * allocation units (blocks or RT extents). + * For writes smaller than the allocation unit, we must fall back to + * buffered I/O to perform read-modify-write cycles. At best this is + * highly inefficient; at worst it leads to page cache invalidation + * races. Tell applications to avoid this by reporting the larger write + * alignment in dio_offset_align, and the smaller read alignment in + * dio_read_offset_align. + */ + stat->dio_read_offset_align = bdev_logical_block_size(bdev); + if (xfs_is_cow_inode(ip)) + stat->dio_offset_align = xfs_inode_alloc_unitsize(ip); + else + stat->dio_offset_align = stat->dio_read_offset_align; +} + +unsigned int +xfs_get_atomic_write_min( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + /* + * If we can complete an atomic write via atomic out of place writes, + * then advertise a minimum size of one fsblock. Without this + * mechanism, we can only guarantee atomic writes up to a single LBA. + * + * If out of place writes are not available, we can guarantee an atomic + * write of exactly one single fsblock if the bdev will make that + * guarantee for us. + */ + if (xfs_inode_can_hw_atomic_write(ip) || xfs_can_sw_atomic_write(mp)) + return mp->m_sb.sb_blocksize; + + return 0; +} + +unsigned int +xfs_get_atomic_write_max( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + /* + * If out of place writes are not available, we can guarantee an atomic + * write of exactly one single fsblock if the bdev will make that + * guarantee for us. + */ + if (!xfs_can_sw_atomic_write(mp)) { + if (xfs_inode_can_hw_atomic_write(ip)) + return mp->m_sb.sb_blocksize; + return 0; + } + + /* + * If we can complete an atomic write via atomic out of place writes, + * then advertise a maximum size of whatever we can complete through + * that means. Hardware support is reported via max_opt, not here. + */ + if (XFS_IS_REALTIME_INODE(ip)) + return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].awu_max); + return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_AG].awu_max); +} + +unsigned int +xfs_get_atomic_write_max_opt( + struct xfs_inode *ip) +{ + unsigned int awu_max = xfs_get_atomic_write_max(ip); + + /* if the max is 1x block, then just keep behaviour that opt is 0 */ + if (awu_max <= ip->i_mount->m_sb.sb_blocksize) + return 0; + + /* + * Advertise the maximum size of an atomic write that we can tell the + * block device to perform for us. In general the bdev limit will be + * less than our out of place write limit, but we don't want to exceed + * the awu_max. + */ + return min(awu_max, xfs_inode_buftarg(ip)->bt_bdev_awu_max); +} + +static void +xfs_report_atomic_write( + struct xfs_inode *ip, + struct kstat *stat) +{ + generic_fill_statx_atomic_writes(stat, + xfs_get_atomic_write_min(ip), + xfs_get_atomic_write_max(ip), + xfs_get_atomic_write_max_opt(ip)); +} + STATIC int xfs_vn_getattr( struct mnt_idmap *idmap, @@ -597,8 +706,9 @@ xfs_vn_getattr( stat->gid = vfsgid_into_kgid(vfsgid); stat->ino = ip->i_ino; stat->atime = inode_get_atime(inode); - stat->mtime = inode_get_mtime(inode); - stat->ctime = inode_get_ctime(inode); + + fill_mg_cmtime(stat, request_mask, inode); + stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks); if (xfs_has_v3inodes(mp)) { @@ -608,11 +718,6 @@ xfs_vn_getattr( } } - if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { - stat->change_cookie = inode_query_iversion(inode); - stat->result_mask |= STATX_CHANGE_COOKIE; - } - /* * Note: If you add another clause to set an attribute flag, please * update attributes_mask below. @@ -635,14 +740,10 @@ xfs_vn_getattr( stat->rdev = inode->i_rdev; break; case S_IFREG: - if (request_mask & STATX_DIOALIGN) { - struct xfs_buftarg *target = xfs_inode_buftarg(ip); - struct block_device *bdev = target->bt_bdev; - - stat->result_mask |= STATX_DIOALIGN; - stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; - stat->dio_offset_align = bdev_logical_block_size(bdev); - } + if (request_mask & (STATX_DIOALIGN | STATX_DIO_READ_ALIGN)) + xfs_report_dioalign(ip, stat); + if (request_mask & STATX_WRITE_ATOMIC) + xfs_report_atomic_write(ip, stat); fallthrough; default: stat->blksize = xfs_stat_blksize(ip); @@ -820,6 +921,7 @@ xfs_setattr_size( uint lock_flags = 0; uint resblks = 0; bool did_zeroing = false; + struct xfs_zone_alloc_ctx ac = { }; xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); ASSERT(S_ISREG(inode->i_mode)); @@ -856,6 +958,28 @@ xfs_setattr_size( inode_dio_wait(inode); /* + * Normally xfs_zoned_space_reserve is supposed to be called outside the + * IOLOCK. For truncate we can't do that since ->setattr is called with + * it already held by the VFS. So for now chicken out and try to + * allocate space under it. + * + * To avoid deadlocks this means we can't block waiting for space, which + * can lead to spurious -ENOSPC if there are no directly available + * blocks. We mitigate this a bit by allowing zeroing to dip into the + * reserved pool, but eventually the VFS calling convention needs to + * change. + */ + if (xfs_is_zoned_inode(ip)) { + error = xfs_zoned_space_reserve(ip, 1, + XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac); + if (error) { + if (error == -EAGAIN) + return -ENOSPC; + return error; + } + } + + /* * File data changes must be complete before we start the transaction to * modify the inode. This needs to be done before joining the inode to * the transaction because the inode cannot be unlocked once it is a @@ -868,11 +992,14 @@ xfs_setattr_size( if (newsize > oldsize) { trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); error = xfs_zero_range(ip, oldsize, newsize - oldsize, - &did_zeroing); + &ac, &did_zeroing); } else { - error = xfs_truncate_page(ip, newsize, &did_zeroing); + error = xfs_truncate_page(ip, newsize, &ac, &did_zeroing); } + if (xfs_is_zoned_inode(ip)) + xfs_zoned_space_unreserve(ip, &ac); + if (error) return error; @@ -1289,6 +1416,7 @@ xfs_setup_inode( { struct inode *inode = &ip->i_vnode; gfp_t gfp_mask; + bool is_meta = xfs_is_internal_inode(ip); inode->i_ino = ip->i_ino; inode->i_state |= I_NEW; @@ -1300,6 +1428,16 @@ xfs_setup_inode( i_size_write(inode, ip->i_disk_size); xfs_diflags_to_iflags(ip, true); + /* + * Mark our metadata files as private so that LSMs and the ACL code + * don't try to add their own metadata or reason about these files, + * and users cannot ever obtain file handles to them. + */ + if (is_meta) { + inode->i_flags |= S_PRIVATE; + inode->i_opflags &= ~IOP_XATTR; + } + if (S_ISDIR(inode->i_mode)) { /* * We set the i_rwsem class here to avoid potential races with |