Diffstat (limited to 'fs')
164 files changed, 13852 insertions, 1611 deletions
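The fs/afs/write.c hunks just below (and matching ones in fs/buffer.c further down) replace open-coded set_bit(AS_ENOSPC/AS_EIO, ...->flags) calls with the mapping_set_error() helper. For reference, a rough sketch of that helper as defined in include/linux/pagemap.h around this kernel vintage (an assumption; this predates the later errseq_t rework):

	/* Sketch of the helper the hunks below switch to; callers pass the
	 * raw errno and the flag selection stays in one place. */
	static inline void mapping_set_error(struct address_space *mapping, int error)
	{
		if (unlikely(error)) {
			if (error == -ENOSPC)
				set_bit(AS_ENOSPC, &mapping->flags);
			else
				set_bit(AS_EIO, &mapping->flags);
		}
	}

Centralizing the AS_* bit choice is why the converted call sites can hand over -ENOSPC or -EIO directly instead of picking a mapping flag themselves.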
diff --git a/fs/afs/write.c b/fs/afs/write.c index 14d506efd1aa..f865c3f05bea 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -398,8 +398,7 @@ no_more: switch (ret) { case -EDQUOT: case -ENOSPC: - set_bit(AS_ENOSPC, - &wb->vnode->vfs_inode.i_mapping->flags); + mapping_set_error(wb->vnode->vfs_inode.i_mapping, -ENOSPC); break; case -EROFS: case -EIO: @@ -409,7 +408,7 @@ no_more: case -ENOMEDIUM: case -ENXIO: afs_kill_pages(wb->vnode, true, first, last); - set_bit(AS_EIO, &wb->vnode->vfs_inode.i_mapping->flags); + mapping_set_error(wb->vnode->vfs_inode.i_mapping, -EIO); break; case -EACCES: case -EPERM: diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index a439548de785..a1fba4285277 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -20,7 +20,8 @@ #define AUTOFS_IOC_COUNT 32 #define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION) -#define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11) +#define AUTOFS_DEV_IOCTL_IOC_COUNT \ + (AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD - AUTOFS_DEV_IOCTL_VERSION_CMD) #include <linux/kernel.h> #include <linux/slab.h> @@ -33,8 +34,6 @@ #include <asm/current.h> #include <linux/uaccess.h> -/* #define DEBUG */ - #ifdef pr_fmt #undef pr_fmt #endif @@ -111,8 +110,6 @@ struct autofs_sb_info { int max_proto; unsigned long exp_timeout; unsigned int type; - int reghost_enabled; - int needs_reghost; struct super_block *sb; struct mutex wq_mutex; struct mutex pipe_mutex; @@ -271,4 +268,4 @@ static inline void autofs4_del_expiring(struct dentry *dentry) } } -extern void autofs4_kill_sb(struct super_block *); +void autofs4_kill_sb(struct super_block *); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index c7fcc7438843..fc09eb77ddf3 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -75,7 +75,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) || (param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) { pr_warn("ioctl control interface version mismatch: " - "kernel(%u.%u), user(%u.%u), cmd(%d)\n", + "kernel(%u.%u), user(%u.%u), cmd(0x%08x)\n", AUTOFS_DEV_IOCTL_VERSION_MAJOR, AUTOFS_DEV_IOCTL_VERSION_MINOR, param->ver_major, param->ver_minor, cmd); @@ -172,6 +172,17 @@ static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f) return sbi; } +/* Return autofs dev ioctl version */ +static int autofs_dev_ioctl_version(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + /* This should have already been set. 
*/ + param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; + param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; + return 0; +} + /* Return autofs module protocol version */ static int autofs_dev_ioctl_protover(struct file *fp, struct autofs_sb_info *sbi, @@ -586,41 +597,25 @@ out: static ioctl_fn lookup_dev_ioctl(unsigned int cmd) { - static struct { - int cmd; - ioctl_fn fn; - } _ioctls[] = { - {cmd_idx(AUTOFS_DEV_IOCTL_VERSION_CMD), NULL}, - {cmd_idx(AUTOFS_DEV_IOCTL_PROTOVER_CMD), - autofs_dev_ioctl_protover}, - {cmd_idx(AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD), - autofs_dev_ioctl_protosubver}, - {cmd_idx(AUTOFS_DEV_IOCTL_OPENMOUNT_CMD), - autofs_dev_ioctl_openmount}, - {cmd_idx(AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD), - autofs_dev_ioctl_closemount}, - {cmd_idx(AUTOFS_DEV_IOCTL_READY_CMD), - autofs_dev_ioctl_ready}, - {cmd_idx(AUTOFS_DEV_IOCTL_FAIL_CMD), - autofs_dev_ioctl_fail}, - {cmd_idx(AUTOFS_DEV_IOCTL_SETPIPEFD_CMD), - autofs_dev_ioctl_setpipefd}, - {cmd_idx(AUTOFS_DEV_IOCTL_CATATONIC_CMD), - autofs_dev_ioctl_catatonic}, - {cmd_idx(AUTOFS_DEV_IOCTL_TIMEOUT_CMD), - autofs_dev_ioctl_timeout}, - {cmd_idx(AUTOFS_DEV_IOCTL_REQUESTER_CMD), - autofs_dev_ioctl_requester}, - {cmd_idx(AUTOFS_DEV_IOCTL_EXPIRE_CMD), - autofs_dev_ioctl_expire}, - {cmd_idx(AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD), - autofs_dev_ioctl_askumount}, - {cmd_idx(AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD), - autofs_dev_ioctl_ismountpoint} + static ioctl_fn _ioctls[] = { + autofs_dev_ioctl_version, + autofs_dev_ioctl_protover, + autofs_dev_ioctl_protosubver, + autofs_dev_ioctl_openmount, + autofs_dev_ioctl_closemount, + autofs_dev_ioctl_ready, + autofs_dev_ioctl_fail, + autofs_dev_ioctl_setpipefd, + autofs_dev_ioctl_catatonic, + autofs_dev_ioctl_timeout, + autofs_dev_ioctl_requester, + autofs_dev_ioctl_expire, + autofs_dev_ioctl_askumount, + autofs_dev_ioctl_ismountpoint, }; unsigned int idx = cmd_idx(cmd); - return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx].fn; + return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx]; } /* ioctl dispatcher */ @@ -642,7 +637,7 @@ static int _autofs_dev_ioctl(unsigned int command, cmd = _IOC_NR(command); if (_IOC_TYPE(command) != _IOC_TYPE(AUTOFS_DEV_IOCTL_IOC_FIRST) || - cmd - cmd_first >= AUTOFS_DEV_IOCTL_IOC_COUNT) { + cmd - cmd_first > AUTOFS_DEV_IOCTL_IOC_COUNT) { return -ENOTTY; } @@ -655,14 +650,11 @@ static int _autofs_dev_ioctl(unsigned int command, if (err) goto out; - /* The validate routine above always sets the version */ - if (cmd == AUTOFS_DEV_IOCTL_VERSION_CMD) - goto done; - fn = lookup_dev_ioctl(cmd); if (!fn) { pr_warn("unknown command 0x%08x\n", command); - return -ENOTTY; + err = -ENOTTY; + goto out; } fp = NULL; @@ -671,9 +663,11 @@ static int _autofs_dev_ioctl(unsigned int command, /* * For obvious reasons the openmount can't have a file * descriptor yet. We don't take a reference to the - * file during close to allow for immediate release. + * file during close to allow for immediate release, + * and the same for retrieving ioctl version. 
*/ - if (cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD && + if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD && + cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD && cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) { fp = fget(param->ioctlfd); if (!fp) { @@ -706,7 +700,6 @@ cont: if (fp) fput(fp); -done: if (err >= 0 && copy_to_user(user, param, AUTOFS_DEV_IOCTL_SIZE)) err = -EFAULT; out: diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index ca9cbd6362e0..438b5bf675b6 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -274,6 +274,23 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) goto fail_dput; } + /* Test versions first */ + if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || + sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { + pr_err("kernel does not match daemon version " + "daemon (%d, %d) kernel (%d, %d)\n", + sbi->min_proto, sbi->max_proto, + AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); + goto fail_dput; + } + + /* Establish highest kernel protocol version */ + if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION) + sbi->version = AUTOFS_MAX_PROTO_VERSION; + else + sbi->version = sbi->max_proto; + sbi->sub_version = AUTOFS_PROTO_SUBVERSION; + if (pgrp_set) { sbi->oz_pgrp = find_get_pid(pgrp); if (!sbi->oz_pgrp) { @@ -291,29 +308,12 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) root_inode->i_fop = &autofs4_root_operations; root_inode->i_op = &autofs4_dir_inode_operations; - /* Couldn't this be tested earlier? */ - if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || - sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { - pr_err("kernel does not match daemon version " - "daemon (%d, %d) kernel (%d, %d)\n", - sbi->min_proto, sbi->max_proto, - AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); - goto fail_dput; - } - - /* Establish highest kernel protocol version */ - if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION) - sbi->version = AUTOFS_MAX_PROTO_VERSION; - else - sbi->version = sbi->max_proto; - sbi->sub_version = AUTOFS_PROTO_SUBVERSION; - pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp)); pipe = fget(pipefd); if (!pipe) { pr_err("could not open pipe file descriptor\n"); - goto fail_dput; + goto fail_put_pid; } ret = autofs_prepare_pipe(pipe); if (ret < 0) @@ -334,14 +334,14 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) fail_fput: pr_err("pipe file descriptor does not contain proper ops\n"); fput(pipe); - /* fall through */ +fail_put_pid: + put_pid(sbi->oz_pgrp); fail_dput: dput(root); goto fail_free; fail_ino: - kfree(ino); + autofs4_free_ino(ino); fail_free: - put_pid(sbi->oz_pgrp); kfree(sbi); s->s_fs_info = NULL; return ret; @@ -368,7 +368,8 @@ struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) inode->i_fop = &autofs4_dir_operations; } else if (S_ISLNK(mode)) { inode->i_op = &autofs4_symlink_inode_operations; - } + } else + WARN_ON(1); return inode; } diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 623510e84c96..a11f73174877 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -577,8 +577,6 @@ static int autofs4_dir_symlink(struct inode *dir, inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555); if (!inode) { kfree(cp); - if (!dentry->d_fsdata) - kfree(ino); return -ENOMEM; } inode->i_private = cp; @@ -842,7 +840,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) if (may_umount(mnt)) status = 1; - pr_debug("returning %d\n", status); + pr_debug("may umount %d\n", status); status = put_user(status, p); diff --git a/fs/block_dev.c b/fs/block_dev.c index 
376e4e426324..05b553368bb4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -30,6 +30,7 @@ #include <linux/cleancache.h> #include <linux/dax.h> #include <linux/badblocks.h> +#include <linux/falloc.h> #include <asm/uaccess.h> #include "internal.h" @@ -1775,6 +1776,81 @@ static const struct address_space_operations def_blk_aops = { .is_dirty_writeback = buffer_check_dirty_writeback, }; +#define BLKDEV_FALLOC_FL_SUPPORTED \ + (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ + FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE) + +static long blkdev_fallocate(struct file *file, int mode, loff_t start, + loff_t len) +{ + struct block_device *bdev = I_BDEV(bdev_file_inode(file)); + struct request_queue *q = bdev_get_queue(bdev); + struct address_space *mapping; + loff_t end = start + len - 1; + loff_t isize; + int error; + + /* Fail if we don't recognize the flags. */ + if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED) + return -EOPNOTSUPP; + + /* Don't go off the end of the device. */ + isize = i_size_read(bdev->bd_inode); + if (start >= isize) + return -EINVAL; + if (end >= isize) { + if (mode & FALLOC_FL_KEEP_SIZE) { + len = isize - start; + end = start + len - 1; + } else + return -EINVAL; + } + + /* + * Don't allow IO that isn't aligned to logical block size. + */ + if ((start | len) & (bdev_logical_block_size(bdev) - 1)) + return -EINVAL; + + /* Invalidate the page cache, including dirty pages. */ + mapping = bdev->bd_inode->i_mapping; + truncate_inode_pages_range(mapping, start, end); + + switch (mode) { + case FALLOC_FL_ZERO_RANGE: + case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: + error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, + GFP_KERNEL, false); + break; + case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: + /* Only punch if the device can do zeroing discard. */ + if (!blk_queue_discard(q) || !q->limits.discard_zeroes_data) + return -EOPNOTSUPP; + error = blkdev_issue_discard(bdev, start >> 9, len >> 9, + GFP_KERNEL, 0); + break; + case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + error = blkdev_issue_discard(bdev, start >> 9, len >> 9, + GFP_KERNEL, 0); + break; + default: + return -EOPNOTSUPP; + } + if (error) + return error; + + /* + * Invalidate again; if someone wandered in and dirtied a page, + * the caller will be given -EBUSY. The third argument is + * inclusive, so the rounding here is safe. 
+ */ + return invalidate_inode_pages2_range(mapping, + start >> PAGE_SHIFT, + end >> PAGE_SHIFT); +} + const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_close, @@ -1789,6 +1865,7 @@ const struct file_operations def_blk_fops = { #endif .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, + .fallocate = blkdev_fallocate, }; int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6c21bad26a27..0b8ce2b9f7d0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -252,7 +252,8 @@ struct btrfs_super_block { #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP \ - (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE) + (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ + BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID) #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e720d3e6ec20..3a57f99d96aa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2586,6 +2586,7 @@ int open_ctree(struct super_block *sb, int num_backups_tried = 0; int backup_index = 0; int max_active; + int clear_free_space_tree = 0; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL); @@ -3148,6 +3149,26 @@ retry_root_backup: if (sb->s_flags & MS_RDONLY) return 0; + if (btrfs_test_opt(fs_info, CLEAR_CACHE) && + btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + clear_free_space_tree = 1; + } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) { + btrfs_warn(fs_info, "free space tree is invalid"); + clear_free_space_tree = 1; + } + + if (clear_free_space_tree) { + btrfs_info(fs_info, "clearing free space tree"); + ret = btrfs_clear_free_space_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, + "failed to clear free space tree: %d", ret); + close_ctree(tree_root); + return ret; + } + } + if (btrfs_test_opt(tree_root->fs_info, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { btrfs_info(fs_info, "creating free space tree"); @@ -3185,18 +3206,6 @@ retry_root_backup: btrfs_qgroup_rescan_resume(fs_info); - if (btrfs_test_opt(tree_root->fs_info, CLEAR_CACHE) && - btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - btrfs_info(fs_info, "clearing free space tree"); - ret = btrfs_clear_free_space_tree(fs_info); - if (ret) { - btrfs_warn(fs_info, - "failed to clear free space tree: %d", ret); - close_ctree(tree_root); - return ret; - } - } - if (!fs_info->uuid_root) { btrfs_info(fs_info, "creating UUID tree"); ret = btrfs_create_uuid_tree(fs_info); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ee40384c394d..66a755150056 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5558,17 +5558,45 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, } } -/* - * The extent buffer bitmap operations are done with byte granularity because - * bitmap items are not guaranteed to be aligned to a word and therefore a - * single word in a bitmap may straddle two pages in the extent buffer. 
- */ -#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE) -#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1) -#define BITMAP_FIRST_BYTE_MASK(start) \ - ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK) -#define BITMAP_LAST_BYTE_MASK(nbits) \ - (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) +void le_bitmap_set(u8 *map, unsigned int start, int len) +{ + u8 *p = map + BIT_BYTE(start); + const unsigned int size = start + len; + int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE); + u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start); + + while (len - bits_to_set >= 0) { + *p |= mask_to_set; + len -= bits_to_set; + bits_to_set = BITS_PER_BYTE; + mask_to_set = ~(u8)0; + p++; + } + if (len) { + mask_to_set &= BITMAP_LAST_BYTE_MASK(size); + *p |= mask_to_set; + } +} + +void le_bitmap_clear(u8 *map, unsigned int start, int len) +{ + u8 *p = map + BIT_BYTE(start); + const unsigned int size = start + len; + int bits_to_clear = BITS_PER_BYTE - (start % BITS_PER_BYTE); + u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(start); + + while (len - bits_to_clear >= 0) { + *p &= ~mask_to_clear; + len -= bits_to_clear; + bits_to_clear = BITS_PER_BYTE; + mask_to_clear = ~(u8)0; + p++; + } + if (len) { + mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); + *p &= ~mask_to_clear; + } +} /* * eb_bitmap_offset() - calculate the page and offset of the byte containing the @@ -5612,7 +5640,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb, int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, unsigned long nr) { - char *kaddr; + u8 *kaddr; struct page *page; unsigned long i; size_t offset; @@ -5634,13 +5662,13 @@ int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) { - char *kaddr; + u8 *kaddr; struct page *page; unsigned long i; size_t offset; const unsigned int size = pos + len; int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); - unsigned int mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); + u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); eb_bitmap_offset(eb, start, pos, &i, &offset); page = eb->pages[i]; @@ -5651,7 +5679,7 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, kaddr[offset] |= mask_to_set; len -= bits_to_set; bits_to_set = BITS_PER_BYTE; - mask_to_set = ~0U; + mask_to_set = ~(u8)0; if (++offset >= PAGE_SIZE && len > 0) { offset = 0; page = eb->pages[++i]; @@ -5676,13 +5704,13 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) { - char *kaddr; + u8 *kaddr; struct page *page; unsigned long i; size_t offset; const unsigned int size = pos + len; int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); - unsigned int mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); + u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); eb_bitmap_offset(eb, start, pos, &i, &offset); page = eb->pages[i]; @@ -5693,7 +5721,7 @@ void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, kaddr[offset] &= ~mask_to_clear; len -= bits_to_clear; bits_to_clear = BITS_PER_BYTE; - mask_to_clear = ~0U; + mask_to_clear = ~(u8)0; if (++offset >= PAGE_SIZE && len > 0) { offset = 0; page = eb->pages[++i]; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4a094f1dc7ef..ab31d145227e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -59,6 +59,28 @@ */ #define 
EXTENT_PAGE_PRIVATE 1 +/* + * The extent buffer bitmap operations are done with byte granularity instead of + * word granularity for two reasons: + * 1. The bitmaps must be little-endian on disk. + * 2. Bitmap items are not guaranteed to be aligned to a word and therefore a + * single word in a bitmap may straddle two pages in the extent buffer. + */ +#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE) +#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1) +#define BITMAP_FIRST_BYTE_MASK(start) \ + ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK) +#define BITMAP_LAST_BYTE_MASK(nbits) \ + (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) + +static inline int le_test_bit(int nr, const u8 *addr) +{ + return 1U & (addr[BIT_BYTE(nr)] >> (nr & (BITS_PER_BYTE-1))); +} + +extern void le_bitmap_set(u8 *map, unsigned int start, int len); +extern void le_bitmap_clear(u8 *map, unsigned int start, int len); + struct extent_state; struct btrfs_root; struct btrfs_io_bio; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index e4a42a8e4f84..57401b474ec6 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -151,7 +151,7 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize) return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE); } -static unsigned long *alloc_bitmap(u32 bitmap_size) +static u8 *alloc_bitmap(u32 bitmap_size) { void *mem; @@ -180,8 +180,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, struct btrfs_free_space_info *info; struct btrfs_key key, found_key; struct extent_buffer *leaf; - unsigned long *bitmap; - char *bitmap_cursor; + u8 *bitmap, *bitmap_cursor; u64 start, end; u64 bitmap_range, i; u32 bitmap_size, flags, expected_extent_count; @@ -231,7 +230,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, block_group->sectorsize); last = div_u64(found_key.objectid + found_key.offset - start, block_group->sectorsize); - bitmap_set(bitmap, first, last - first); + le_bitmap_set(bitmap, first, last - first); extent_count++; nr++; @@ -270,7 +269,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, goto out; } - bitmap_cursor = (char *)bitmap; + bitmap_cursor = bitmap; bitmap_range = block_group->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; i = start; while (i < end) { @@ -319,7 +318,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, struct btrfs_free_space_info *info; struct btrfs_key key, found_key; struct extent_buffer *leaf; - unsigned long *bitmap; + u8 *bitmap; u64 start, end; /* Initialize to silence GCC. 
*/ u64 extent_start = 0; @@ -363,7 +362,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, break; } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) { unsigned long ptr; - char *bitmap_cursor; + u8 *bitmap_cursor; u32 bitmap_pos, data_size; ASSERT(found_key.objectid >= start); @@ -373,7 +372,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, bitmap_pos = div_u64(found_key.objectid - start, block_group->sectorsize * BITS_PER_BYTE); - bitmap_cursor = ((char *)bitmap) + bitmap_pos; + bitmap_cursor = bitmap + bitmap_pos; data_size = free_space_bitmap_size(found_key.offset, block_group->sectorsize); @@ -410,7 +409,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, offset = start; bitnr = 0; while (offset < end) { - bit = !!test_bit(bitnr, bitmap); + bit = !!le_test_bit(bitnr, bitmap); if (prev_bit == 0 && bit == 1) { extent_start = offset; } else if (prev_bit == 1 && bit == 0) { @@ -1185,6 +1184,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) } btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE); + btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); ret = btrfs_commit_transaction(trans, tree_root); @@ -1253,6 +1253,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) return PTR_ERR(trans); btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE); + btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); fs_info->free_space_root = NULL; ret = clear_free_space_tree(trans, free_space_root); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index d19ab0317283..caad80bb9bd0 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -273,20 +273,37 @@ out: return ret; } -/** - * test_bit_in_byte - Determine whether a bit is set in a byte - * @nr: bit number to test - * @addr: Address to start counting from - */ -static inline int test_bit_in_byte(int nr, const u8 *addr) +static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb, + unsigned long len) { - return 1UL & (addr[nr / BITS_PER_BYTE] >> (nr & (BITS_PER_BYTE - 1))); + unsigned long i; + + for (i = 0; i < len * BITS_PER_BYTE; i++) { + int bit, bit1; + + bit = !!test_bit(i, bitmap); + bit1 = !!extent_buffer_test_bit(eb, 0, i); + if (bit1 != bit) { + test_msg("Bits do not match\n"); + return -EINVAL; + } + + bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE, + i % BITS_PER_BYTE); + if (bit1 != bit) { + test_msg("Offset bits do not match\n"); + return -EINVAL; + } + } + return 0; } static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, unsigned long len) { - unsigned long i, x; + unsigned long i, j; + u32 x; + int ret; memset(bitmap, 0, len); memset_extent_buffer(eb, 0, 0, len); @@ -297,16 +314,18 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, bitmap_set(bitmap, 0, len * BITS_PER_BYTE); extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + ret = check_eb_bitmap(bitmap, eb, len); + if (ret) { test_msg("Setting all bits failed\n"); - return -EINVAL; + return ret; } bitmap_clear(bitmap, 0, len * BITS_PER_BYTE); extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + ret = check_eb_bitmap(bitmap, eb, len); + if (ret) { test_msg("Clearing all bits failed\n"); - return -EINVAL; + return ret; } /* Straddling pages test */ @@ -316,9 
+335,10 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, sizeof(long) * BITS_PER_BYTE); extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0, sizeof(long) * BITS_PER_BYTE); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + ret = check_eb_bitmap(bitmap, eb, len); + if (ret) { test_msg("Setting straddling pages failed\n"); - return -EINVAL; + return ret; } bitmap_set(bitmap, 0, len * BITS_PER_BYTE); @@ -328,9 +348,10 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0, sizeof(long) * BITS_PER_BYTE); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + ret = check_eb_bitmap(bitmap, eb, len); + if (ret) { test_msg("Clearing straddling pages failed\n"); - return -EINVAL; + return ret; } } @@ -339,28 +360,22 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, * something repetitive that could miss some hypothetical off-by-n bug. */ x = 0; - for (i = 0; i < len / sizeof(long); i++) { - x = (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffUL; - bitmap[i] = x; - } - write_extent_buffer(eb, bitmap, 0, len); - - for (i = 0; i < len * BITS_PER_BYTE; i++) { - int bit, bit1; - - bit = !!test_bit_in_byte(i, (u8 *)bitmap); - bit1 = !!extent_buffer_test_bit(eb, 0, i); - if (bit1 != bit) { - test_msg("Testing bit pattern failed\n"); - return -EINVAL; + bitmap_clear(bitmap, 0, len * BITS_PER_BYTE); + extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE); + for (i = 0; i < len * BITS_PER_BYTE / 32; i++) { + x = (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffU; + for (j = 0; j < 32; j++) { + if (x & (1U << j)) { + bitmap_set(bitmap, i * 32 + j, 1); + extent_buffer_bitmap_set(eb, 0, i * 32 + j, 1); + } } + } - bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE, - i % BITS_PER_BYTE); - if (bit1 != bit) { - test_msg("Testing bit pattern with offset failed\n"); - return -EINVAL; - } + ret = check_eb_bitmap(bitmap, eb, len); + if (ret) { + test_msg("Random bit pattern failed\n"); + return ret; } return 0; diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 7508d3b42780..6e144048a72e 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -24,20 +24,15 @@ #include "../transaction.h" struct free_space_extent { - u64 start, length; + u64 start; + u64 length; }; -/* - * The test cases align their operations to this in order to hit some of the - * edge cases in the bitmap code. 
- */ -#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * PAGE_SIZE) - static int __check_free_space_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, struct btrfs_path *path, - struct free_space_extent *extents, + const struct free_space_extent * const extents, unsigned int num_extents) { struct btrfs_free_space_info *info; @@ -126,7 +121,7 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, struct btrfs_path *path, - struct free_space_extent *extents, + const struct free_space_extent * const extents, unsigned int num_extents) { struct btrfs_free_space_info *info; @@ -168,9 +163,10 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans, static int test_empty_block_group(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { + const struct free_space_extent extents[] = { {cache->key.objectid, cache->key.offset}, }; @@ -181,9 +177,10 @@ static int test_empty_block_group(struct btrfs_trans_handle *trans, static int test_remove_all(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = {}; + const struct free_space_extent extents[] = {}; int ret; ret = __remove_from_free_space_tree(trans, fs_info, cache, path, @@ -201,16 +198,17 @@ static int test_remove_all(struct btrfs_trans_handle *trans, static int test_remove_beginning(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { - {cache->key.objectid + BITMAP_RANGE, - cache->key.offset - BITMAP_RANGE}, + const struct free_space_extent extents[] = { + {cache->key.objectid + alignment, + cache->key.offset - alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid, BITMAP_RANGE); + cache->key.objectid, alignment); if (ret) { test_msg("Could not remove free space\n"); return ret; @@ -224,17 +222,18 @@ static int test_remove_beginning(struct btrfs_trans_handle *trans, static int test_remove_end(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { - {cache->key.objectid, cache->key.offset - BITMAP_RANGE}, + const struct free_space_extent extents[] = { + {cache->key.objectid, cache->key.offset - alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, fs_info, cache, path, cache->key.objectid + - cache->key.offset - BITMAP_RANGE, - BITMAP_RANGE); + cache->key.offset - alignment, + alignment); if (ret) { test_msg("Could not remove free space\n"); return ret; @@ -247,18 +246,19 @@ static int test_remove_end(struct btrfs_trans_handle *trans, static int test_remove_middle(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { - {cache->key.objectid, BITMAP_RANGE}, - {cache->key.objectid + 2 * BITMAP_RANGE, - 
cache->key.offset - 2 * BITMAP_RANGE}, + const struct free_space_extent extents[] = { + {cache->key.objectid, alignment}, + {cache->key.objectid + 2 * alignment, + cache->key.offset - 2 * alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + alignment, + alignment); if (ret) { test_msg("Could not remove free space\n"); return ret; @@ -271,10 +271,11 @@ static int test_remove_middle(struct btrfs_trans_handle *trans, static int test_merge_left(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { - {cache->key.objectid, 2 * BITMAP_RANGE}, + const struct free_space_extent extents[] = { + {cache->key.objectid, 2 * alignment}, }; int ret; @@ -287,15 +288,15 @@ static int test_merge_left(struct btrfs_trans_handle *trans, } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid, BITMAP_RANGE); + cache->key.objectid, alignment); if (ret) { test_msg("Could not add free space\n"); return ret; } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + alignment, + alignment); if (ret) { test_msg("Could not add free space\n"); return ret; @@ -308,10 +309,11 @@ static int test_merge_left(struct btrfs_trans_handle *trans, static int test_merge_right(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { - {cache->key.objectid + BITMAP_RANGE, 2 * BITMAP_RANGE}, + const struct free_space_extent extents[] = { + {cache->key.objectid + alignment, 2 * alignment}, }; int ret; @@ -324,16 +326,16 @@ static int test_merge_right(struct btrfs_trans_handle *trans, } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + 2 * BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + 2 * alignment, + alignment); if (ret) { test_msg("Could not add free space\n"); return ret; } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + alignment, + alignment); if (ret) { test_msg("Could not add free space\n"); return ret; @@ -346,10 +348,11 @@ static int test_merge_right(struct btrfs_trans_handle *trans, static int test_merge_both(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { - {cache->key.objectid, 3 * BITMAP_RANGE}, + const struct free_space_extent extents[] = { + {cache->key.objectid, 3 * alignment}, }; int ret; @@ -362,23 +365,23 @@ static int test_merge_both(struct btrfs_trans_handle *trans, } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid, BITMAP_RANGE); + cache->key.objectid, alignment); if (ret) { test_msg("Could not add free space\n"); return ret; } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + 2 * BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + 2 * alignment, + alignment); if (ret) { test_msg("Could not add free space\n"); return ret; } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + 
BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + alignment, + alignment); if (ret) { test_msg("Could not add free space\n"); return ret; @@ -391,12 +394,13 @@ static int test_merge_both(struct btrfs_trans_handle *trans, static int test_merge_none(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { - {cache->key.objectid, BITMAP_RANGE}, - {cache->key.objectid + 2 * BITMAP_RANGE, BITMAP_RANGE}, - {cache->key.objectid + 4 * BITMAP_RANGE, BITMAP_RANGE}, + const struct free_space_extent extents[] = { + {cache->key.objectid, alignment}, + {cache->key.objectid + 2 * alignment, alignment}, + {cache->key.objectid + 4 * alignment, alignment}, }; int ret; @@ -409,23 +413,23 @@ static int test_merge_none(struct btrfs_trans_handle *trans, } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid, BITMAP_RANGE); + cache->key.objectid, alignment); if (ret) { test_msg("Could not add free space\n"); return ret; } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + 4 * BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + 4 * alignment, + alignment); if (ret) { test_msg("Could not add free space\n"); return ret; } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + 2 * BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + 2 * alignment, + alignment); if (ret) { test_msg("Could not add free space\n"); return ret; @@ -438,10 +442,11 @@ static int test_merge_none(struct btrfs_trans_handle *trans, typedef int (*test_func_t)(struct btrfs_trans_handle *, struct btrfs_fs_info *, struct btrfs_block_group_cache *, - struct btrfs_path *); + struct btrfs_path *, + u32 alignment); -static int run_test(test_func_t test_func, int bitmaps, - u32 sectorsize, u32 nodesize) +static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, + u32 nodesize, u32 alignment) { struct btrfs_fs_info *fs_info; struct btrfs_root *root = NULL; @@ -480,7 +485,7 @@ static int run_test(test_func_t test_func, int bitmaps, btrfs_set_header_nritems(root->node, 0); root->alloc_bytenr += 2 * nodesize; - cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE, sectorsize); + cache = btrfs_alloc_dummy_block_group(8 * alignment, sectorsize); if (!cache) { test_msg("Couldn't allocate dummy block group cache\n"); ret = -ENOMEM; @@ -514,7 +519,7 @@ static int run_test(test_func_t test_func, int bitmaps, } } - ret = test_func(&trans, root->fs_info, cache, path); + ret = test_func(&trans, root->fs_info, cache, path, alignment); if (ret) goto out; @@ -539,15 +544,27 @@ out: return ret; } -static int run_test_both_formats(test_func_t test_func, - u32 sectorsize, u32 nodesize) +static int run_test_both_formats(test_func_t test_func, u32 sectorsize, + u32 nodesize, u32 alignment) { + int test_ret = 0; int ret; - ret = run_test(test_func, 0, sectorsize, nodesize); - if (ret) - return ret; - return run_test(test_func, 1, sectorsize, nodesize); + ret = run_test(test_func, 0, sectorsize, nodesize, alignment); + if (ret) { + test_msg("%pf failed with extents, sectorsize=%u, nodesize=%u, alignment=%u\n", + test_func, sectorsize, nodesize, alignment); + test_ret = ret; + } + + ret = run_test(test_func, 1, sectorsize, nodesize, alignment); + if (ret) { + test_msg("%pf failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u\n", + test_func, sectorsize, nodesize, alignment); + test_ret = ret; + 
} + + return test_ret; } int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize) @@ -563,18 +580,30 @@ int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize) test_merge_both, test_merge_none, }; + u32 bitmap_alignment; + int test_ret = 0; int i; + /* + * Align some operations to a page to flush out bugs in the extent + * buffer bitmap handling of highmem. + */ + bitmap_alignment = BTRFS_FREE_SPACE_BITMAP_BITS * PAGE_SIZE; + test_msg("Running free space tree tests\n"); for (i = 0; i < ARRAY_SIZE(tests); i++) { - int ret = run_test_both_formats(tests[i], sectorsize, - nodesize); - if (ret) { - test_msg("%pf : sectorsize %u failed\n", - tests[i], sectorsize); - return ret; - } + int ret; + + ret = run_test_both_formats(tests[i], sectorsize, nodesize, + sectorsize); + if (ret) + test_ret = ret; + + ret = run_test_both_formats(tests[i], sectorsize, nodesize, + bitmap_alignment); + if (ret) + test_ret = ret; } - return 0; + return test_ret; } diff --git a/fs/buffer.c b/fs/buffer.c index 7dad8713fac8..b205a629001d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -351,7 +351,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) set_buffer_uptodate(bh); } else { buffer_io_error(bh, ", lost async page write"); - set_bit(AS_EIO, &page->mapping->flags); + mapping_set_error(page->mapping, -EIO); set_buffer_write_io_error(bh); clear_buffer_uptodate(bh); SetPageError(page); @@ -3249,7 +3249,7 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free) bh = head; do { if (buffer_write_io_error(bh) && page->mapping) - set_bit(AS_EIO, &page->mapping->flags); + mapping_set_error(page->mapping, -EIO); if (buffer_busy(bh)) goto failed; bh = bh->b_this_page; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 6c58e13fed2f..3d03e48a9213 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -152,6 +152,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) list_for_each(tmp1, &cifs_tcp_ses_list) { server = list_entry(tmp1, struct TCP_Server_Info, tcp_ses_list); + seq_printf(m, "\nNumber of credits: %d", server->credits); i++; list_for_each(tmp2, &server->smb_ses_list) { ses = list_entry(tmp2, struct cifs_ses, diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 1418daa03d95..07ed81cf1552 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -49,6 +49,7 @@ #define CIFS_MOUNT_USE_PREFIX_PATH 0x1000000 /* make subpath with unaccessible * root mountable */ +#define CIFS_MOUNT_UID_FROM_ACL 0x2000000 /* try to get UID via special SID */ struct cifs_sb_info { struct rb_root tlink_tree; diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index 0065256881d8..57ff0756e30c 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -36,7 +36,15 @@ struct smb_mnt_fs_info { __u64 cifs_posix_caps; } __packed; +struct smb_snapshot_array { + __u32 number_of_snapshots; + __u32 number_of_snapshots_returned; + __u32 snapshot_array_size; + /* snapshots[]; */ +} __packed; + #define CIFS_IOCTL_MAGIC 0xCF #define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) #define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) #define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info) +#define CIFS_ENUMERATE_SNAPSHOTS _IOR(CIFS_IOCTL_MAGIC, 6, struct smb_snapshot_array) diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 71e8a56e9479..15bac390dff9 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -42,6 +42,35 @@ static const struct cifs_sid sid_authusers = { /* group users */ static const struct cifs_sid 
sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; +/* S-1-22-1 Unmapped Unix users */ +static const struct cifs_sid sid_unix_users = {1, 1, {0, 0, 0, 0, 0, 22}, + {cpu_to_le32(1), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; + +/* S-1-22-2 Unmapped Unix groups */ +static const struct cifs_sid sid_unix_groups = { 1, 1, {0, 0, 0, 0, 0, 22}, + {cpu_to_le32(2), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; + +/* + * See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx + */ + +/* S-1-5-88 MS NFS and Apple style UID/GID/mode */ + +/* S-1-5-88-1 Unix uid */ +static const struct cifs_sid sid_unix_NFS_users = { 1, 2, {0, 0, 0, 0, 0, 5}, + {cpu_to_le32(88), + cpu_to_le32(1), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; + +/* S-1-5-88-2 Unix gid */ +static const struct cifs_sid sid_unix_NFS_groups = { 1, 2, {0, 0, 0, 0, 0, 5}, + {cpu_to_le32(88), + cpu_to_le32(2), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; + +/* S-1-5-88-3 Unix mode */ +static const struct cifs_sid sid_unix_NFS_mode = { 1, 2, {0, 0, 0, 0, 0, 5}, + {cpu_to_le32(88), + cpu_to_le32(3), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; + static const struct cred *root_cred; static int @@ -183,6 +212,62 @@ compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) return 0; /* sids compare/match */ } +static bool +is_well_known_sid(const struct cifs_sid *psid, uint32_t *puid, bool is_group) +{ + int i; + int num_subauth; + const struct cifs_sid *pwell_known_sid; + + if (!psid || (puid == NULL)) + return false; + + num_subauth = psid->num_subauth; + + /* check if Mac (or Windows NFS) vs. Samba format for Unix owner SID */ + if (num_subauth == 2) { + if (is_group) + pwell_known_sid = &sid_unix_groups; + else + pwell_known_sid = &sid_unix_users; + } else if (num_subauth == 3) { + if (is_group) + pwell_known_sid = &sid_unix_NFS_groups; + else + pwell_known_sid = &sid_unix_NFS_users; + } else + return false; + + /* compare the revision */ + if (psid->revision != pwell_known_sid->revision) + return false; + + /* compare all of the six auth values */ + for (i = 0; i < NUM_AUTHS; ++i) { + if (psid->authority[i] != pwell_known_sid->authority[i]) { + cifs_dbg(FYI, "auth %d did not match\n", i); + return false; + } + } + + if (num_subauth == 2) { + if (psid->sub_auth[0] != pwell_known_sid->sub_auth[0]) + return false; + + *puid = le32_to_cpu(psid->sub_auth[1]); + } else /* 3 subauths, ie Windows/Mac style */ { + *puid = le32_to_cpu(psid->sub_auth[0]); + if ((psid->sub_auth[0] != pwell_known_sid->sub_auth[0]) || + (psid->sub_auth[1] != pwell_known_sid->sub_auth[1])) + return false; + + *puid = le32_to_cpu(psid->sub_auth[2]); + } + + cifs_dbg(FYI, "Unix UID %d returned from SID\n", *puid); + return true; /* well known sid found, uid returned */ +} + static void cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) { @@ -276,6 +361,43 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, return -EIO; } + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) { + uint32_t unix_id; + bool is_group; + + if (sidtype != SIDOWNER) + is_group = true; + else + is_group = false; + + if (is_well_known_sid(psid, &unix_id, is_group) == false) + goto try_upcall_to_get_id; + + if (is_group) { + kgid_t gid; + gid_t id; + + id = (gid_t)unix_id; + gid = make_kgid(&init_user_ns, id); + if (gid_valid(gid)) { + fgid = gid; + goto got_valid_id; + } + } else { + kuid_t uid; + uid_t id; + + id = (uid_t)unix_id; + uid = make_kuid(&init_user_ns, id); + if (uid_valid(uid)) { + fuid = uid; + goto got_valid_id; + } + } + /* If unable to find 
uid/gid easily from SID try via upcall */ + } + +try_upcall_to_get_id: sidstr = sid_to_key_str(psid, sidtype); if (!sidstr) return -ENOMEM; @@ -329,6 +451,7 @@ out_revert_creds: * Note that we return 0 here unconditionally. If the mapping * fails then we just fall back to using the mnt_uid/mnt_gid. */ +got_valid_id: if (sidtype == SIDOWNER) fattr->cf_uid = fuid; else diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index cca04e710421..15261ba464c5 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -64,15 +64,15 @@ unsigned int global_secflags = CIFSSEC_DEF; unsigned int sign_CIFS_PDUs = 1; static const struct super_operations cifs_super_ops; unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; -module_param(CIFSMaxBufSize, uint, 0); +module_param(CIFSMaxBufSize, uint, 0444); MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). " "Default: 16384 Range: 8192 to 130048"); unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL; -module_param(cifs_min_rcv, uint, 0); +module_param(cifs_min_rcv, uint, 0444); MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: " "1 to 64"); unsigned int cifs_min_small = 30; -module_param(cifs_min_small, uint, 0); +module_param(cifs_min_small, uint, 0444); MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " "Range: 2 to 256"); unsigned int cifs_max_pending = CIFS_MAX_REQ; @@ -271,7 +271,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode->createtime = 0; cifs_inode->epoch = 0; #ifdef CONFIG_CIFS_SMB2 - get_random_bytes(cifs_inode->lease_key, SMB2_LEASE_KEY_SIZE); + generate_random_uuid(cifs_inode->lease_key); #endif /* * Can not set i_flags here - they get immediately overwritten to zero @@ -469,6 +469,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",posixpaths"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) seq_puts(s, ",setuids"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) + seq_puts(s, ",idsfromsid"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) seq_puts(s, ",serverino"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) @@ -1262,7 +1264,6 @@ init_cifs(void) GlobalTotalActiveXid = 0; GlobalMaxActiveXid = 0; spin_lock_init(&cifs_tcp_ses_lock); - spin_lock_init(&cifs_file_list_lock); spin_lock_init(&GlobalMid_Lock); get_random_bytes(&cifs_lock_secret, sizeof(cifs_lock_secret)); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 8f1d8c1e72be..1f17f6bd7a60 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -75,6 +75,18 @@ #define SMB_ECHO_INTERVAL_MAX 600 #define SMB_ECHO_INTERVAL_DEFAULT 60 +/* + * Default number of credits to keep available for SMB3. + * This value is chosen somewhat arbitrarily. The Windows client + * defaults to 128 credits, the Windows server allows clients up to + * 512 credits (or 8K for later versions), and the NetApp server + * does not limit clients at all. Choose a high enough default value + * such that the client shouldn't limit performance, but allow mount + * to override (until you approach 64K, where we limit credits to 65000 + * to reduce possibility of seeing more server credit overflow bugs. 
+ */ +#define SMB2_MAX_CREDITS_AVAILABLE 32000 + #include "cifspdu.h" #ifndef XATTR_DOS_ATTRIB @@ -376,6 +388,8 @@ struct smb_version_operations { int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *); int (*set_integrity)(const unsigned int, struct cifs_tcon *tcon, struct cifsFileInfo *src_file); + int (*enum_snapshots)(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *src_file, void __user *); int (*query_mf_symlink)(unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const unsigned char *, char *, unsigned int *); @@ -464,6 +478,7 @@ struct smb_vol { bool retry:1; bool intr:1; bool setuids:1; + bool setuidfromacl:1; bool override_uid:1; bool override_gid:1; bool dynperm:1; @@ -510,6 +525,7 @@ struct smb_vol { struct sockaddr_storage srcaddr; /* allow binding to a local IP */ struct nls_table *local_nls; unsigned int echo_interval; /* echo interval in secs */ + unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */ }; #define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \ @@ -567,7 +583,8 @@ struct TCP_Server_Info { bool noblocksnd; /* use blocking sendmsg */ bool noautotune; /* do not autotune send buf sizes */ bool tcp_nodelay; - int credits; /* send no more requests at once */ + unsigned int credits; /* send no more requests at once */ + unsigned int max_credits; /* can override large 32000 default at mnt */ unsigned int in_flight; /* number of requests on the wire to server */ spinlock_t req_lock; /* protect the two values above */ struct mutex srv_mutex; @@ -833,6 +850,7 @@ struct cifs_tcon { struct list_head tcon_list; int tc_count; struct list_head openFileList; + spinlock_t open_file_lock; /* protects list above */ struct cifs_ses *ses; /* pointer to session associated with */ char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */ char *nativeFileSystem; @@ -889,7 +907,7 @@ struct cifs_tcon { #endif /* CONFIG_CIFS_STATS2 */ __u64 bytes_read; __u64 bytes_written; - spinlock_t stat_lock; + spinlock_t stat_lock; /* protects the two fields above */ #endif /* CONFIG_CIFS_STATS */ FILE_SYSTEM_DEVICE_INFO fsDevInfo; FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */ @@ -1040,20 +1058,24 @@ struct cifs_fid_locks { }; struct cifsFileInfo { + /* following two lists are protected by tcon->open_file_lock */ struct list_head tlist; /* pointer to next fid owned by tcon */ struct list_head flist; /* next fid (file instance) for this inode */ + /* lock list below protected by cifsi->lock_sem */ struct cifs_fid_locks *llist; /* brlocks held by this fid */ kuid_t uid; /* allows finding which FileInfo structure */ __u32 pid; /* process id who opened file */ struct cifs_fid fid; /* file id from remote */ + struct list_head rlist; /* reconnect list */ /* BB add lock scope info here if needed */ ; /* lock scope id (0 if none) */ struct dentry *dentry; - unsigned int f_flags; struct tcon_link *tlink; + unsigned int f_flags; bool invalidHandle:1; /* file closed via session abend */ bool oplock_break_cancelled:1; - int count; /* refcount protected by cifs_file_list_lock */ + int count; + spinlock_t file_info_lock; /* protects four flag/count fields above */ struct mutex fh_mutex; /* prevents reopen race after dead ses*/ struct cifs_search_info srch_inf; struct work_struct oplock_break; /* work for oplock breaks */ @@ -1120,7 +1142,7 @@ struct cifs_writedata { /* * Take a reference on the file private data. Must be called with - * cifs_file_list_lock held. + * cfile->file_info_lock held. 
*/ static inline void cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file) @@ -1514,8 +1536,10 @@ require use of the stronger protocol */ * GlobalMid_Lock protects: * list operations on pending_mid_q and oplockQ * updates to XID counters, multiplex id and SMB sequence numbers - * cifs_file_list_lock protects: - * list operations on tcp and SMB session lists and tCon lists + * tcp_ses_lock protects: + * list operations on tcp and SMB session lists + * tcon->open_file_lock protects the list of open files hanging off the tcon + * cfile->file_info_lock protects counters and fields in cifs file struct * f_owner.lock protects certain per file struct operations * mapping->page_lock protects certain per page operations * @@ -1547,18 +1571,12 @@ GLOBAL_EXTERN struct list_head cifs_tcp_ses_list; * tcp session, and the list of tcon's per smb session. It also protects * the reference counters for the server, smb session, and tcon. Finally, * changes to the tcon->tidStatus should be done while holding this lock. + * generally the locks should be taken in order tcp_ses_lock before + * tcon->open_file_lock and that before file->file_info_lock since the + * structure order is cifs_socket-->cifs_ses-->cifs_tcon-->cifs_file */ GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock; -/* - * This lock protects the cifs_file->llist and cifs_file->flist - * list operations, and updates to some flags (cifs_file->invalidHandle) - * It will be moved to either use the tcon->stat_lock or equivalent later. - * If cifs_tcp_ses_lock and the lock below are both needed to be held, then - * the cifs_tcp_ses_lock must be grabbed first and released last. - */ -GLOBAL_EXTERN spinlock_t cifs_file_list_lock; - #ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ /* Outstanding dir notify requests */ GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 4ead72a001f9..ced0e42ce460 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -193,6 +193,8 @@ extern struct smb_vol *cifs_get_volume_info(char *mount_data, extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *); extern void cifs_umount(struct cifs_sb_info *); extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon); +extern void cifs_reopen_persistent_handles(struct cifs_tcon *tcon); + extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, __u8 type, struct cifsLockInfo **conf_lock, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index f82d2823622f..3f3185febc58 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -98,13 +98,13 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) struct list_head *tmp1; /* list all files open on tree connection and mark them invalid */ - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_for_each_safe(tmp, tmp1, &tcon->openFileList) { open_file = list_entry(tmp, struct cifsFileInfo, tlist); open_file->invalidHandle = true; open_file->oplock_break_cancelled = true; } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); /* * BB Add call to invalidate_inodes(sb) for all superblocks mounted * to this tcon. 
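The cifs changes above and below retire the global cifs_file_list_lock in favor of a per-tcon open_file_lock plus a per-file file_info_lock, with the documented order cifs_tcp_ses_lock, then tcon->open_file_lock, then cfile->file_info_lock. A minimal sketch of a traversal honoring that order, using the identifiers from this diff (the mark_stale callback is hypothetical, for illustration only):

	/* Illustrative walk of a tcon's open files under the new, documented
	 * lock order; the per-file lock is the innermost. */
	static void for_each_open_file(struct cifs_tcon *tcon,
				       void (*mark_stale)(struct cifsFileInfo *))
	{
		struct cifsFileInfo *cfile;

		spin_lock(&tcon->open_file_lock);
		list_for_each_entry(cfile, &tcon->openFileList, tlist) {
			spin_lock(&cfile->file_info_lock);
			mark_stale(cfile);
			spin_unlock(&cfile->file_info_lock);
		}
		spin_unlock(&tcon->open_file_lock);
	}

This mirrors the nesting cifsFileInfo_put() uses after this series, and is why cifsFileInfo_get() can now take only the per-file lock instead of the global list lock.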
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2e4f4bad8b1e..aab5227979e2 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -63,7 +63,6 @@ extern mempool_t *cifs_req_poolp; #define TLINK_IDLE_EXPIRE (600 * HZ) enum { - /* Mount options that take no arguments */ Opt_user_xattr, Opt_nouser_xattr, Opt_forceuid, Opt_noforceuid, @@ -76,7 +75,7 @@ enum { Opt_noposixpaths, Opt_nounix, Opt_nocase, Opt_brl, Opt_nobrl, - Opt_forcemandatorylock, Opt_setuids, + Opt_forcemandatorylock, Opt_setuidfromacl, Opt_setuids, Opt_nosetuids, Opt_dynperm, Opt_nodynperm, Opt_nohard, Opt_nosoft, Opt_nointr, Opt_intr, @@ -95,7 +94,7 @@ enum { Opt_cruid, Opt_gid, Opt_file_mode, Opt_dirmode, Opt_port, Opt_rsize, Opt_wsize, Opt_actimeo, - Opt_echo_interval, + Opt_echo_interval, Opt_max_credits, /* Mount options which take string value */ Opt_user, Opt_pass, Opt_ip, @@ -148,6 +147,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_forcemandatorylock, "forcemand" }, { Opt_setuids, "setuids" }, { Opt_nosetuids, "nosetuids" }, + { Opt_setuidfromacl, "idsfromsid" }, { Opt_dynperm, "dynperm" }, { Opt_nodynperm, "nodynperm" }, { Opt_nohard, "nohard" }, @@ -190,6 +190,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_wsize, "wsize=%s" }, { Opt_actimeo, "actimeo=%s" }, { Opt_echo_interval, "echo_interval=%s" }, + { Opt_max_credits, "max_credits=%s" }, { Opt_blank_user, "user=" }, { Opt_blank_user, "username=" }, @@ -1376,6 +1377,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_nosetuids: vol->setuids = 0; break; + case Opt_setuidfromacl: + vol->setuidfromacl = 1; + break; case Opt_dynperm: vol->dynperm = true; break; @@ -1586,6 +1590,15 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } vol->echo_interval = option; break; + case Opt_max_credits: + if (get_option_ul(args, &option) || (option < 20) || + (option > 60000)) { + cifs_dbg(VFS, "%s: Invalid max_credits value\n", + __func__); + goto cifs_parse_mount_err; + } + vol->max_credits = option; + break; /* String Arguments */ @@ -2163,7 +2176,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr, sizeof(tcp_ses->dstaddr)); #ifdef CONFIG_CIFS_SMB2 - get_random_bytes(tcp_ses->client_guid, SMB2_CLIENT_GUID_SIZE); + generate_random_uuid(tcp_ses->client_guid); #endif /* * at this point we are the only ones with the pointer @@ -3270,6 +3283,8 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; if (pvolume_info->setuids) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID; + if (pvolume_info->setuidfromacl) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UID_FROM_ACL; if (pvolume_info->server_ino) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM; if (pvolume_info->remap) @@ -3598,7 +3613,11 @@ try_mount_again: bdi_destroy(&cifs_sb->bdi); goto out; } - + if ((volume_info->max_credits < 20) || + (volume_info->max_credits > 60000)) + server->max_credits = SMB2_MAX_CREDITS_AVAILABLE; + else + server->max_credits = volume_info->max_credits; /* get a reference to a SMB session */ ses = cifs_get_smb_ses(server, volume_info); if (IS_ERR(ses)) { @@ -3688,14 +3707,16 @@ remote_path_check: goto mount_fail_check; } - rc = cifs_are_all_path_components_accessible(server, + if (rc != -EREMOTE) { + rc = cifs_are_all_path_components_accessible(server, xid, tcon, cifs_sb, full_path); - if (rc != 0) { - cifs_dbg(VFS, "cannot query dirs between root and final path, " - "enabling CIFS_MOUNT_USE_PREFIX_PATH\n"); - 
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; - rc = 0; + if (rc != 0) { + cifs_dbg(VFS, "cannot query dirs between root and final path, " + "enabling CIFS_MOUNT_USE_PREFIX_PATH\n"); + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; + rc = 0; + } } kfree(full_path); } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index a95fe8b1afe9..7f5f6176c6f1 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -305,6 +305,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, cfile->tlink = cifs_get_tlink(tlink); INIT_WORK(&cfile->oplock_break, cifs_oplock_break); mutex_init(&cfile->fh_mutex); + spin_lock_init(&cfile->file_info_lock); cifs_sb_active(inode->i_sb); @@ -317,7 +318,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, oplock = 0; } - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE && oplock) oplock = fid->pending_open->oplock; list_del(&fid->pending_open->olist); @@ -326,12 +327,13 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, server->ops->set_fid(cfile, fid, oplock); list_add(&cfile->tlist, &tcon->openFileList); + /* if readable file instance put first in list*/ if (file->f_mode & FMODE_READ) list_add(&cfile->flist, &cinode->openFileList); else list_add_tail(&cfile->flist, &cinode->openFileList); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); if (fid->purge_cache) cifs_zap_mapping(inode); @@ -343,16 +345,16 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct cifsFileInfo * cifsFileInfo_get(struct cifsFileInfo *cifs_file) { - spin_lock(&cifs_file_list_lock); + spin_lock(&cifs_file->file_info_lock); cifsFileInfo_get_locked(cifs_file); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_file->file_info_lock); return cifs_file; } /* * Release a reference on the file private data. This may involve closing * the filehandle out on the server. Must be called without holding - * cifs_file_list_lock. + * tcon->open_file_lock and cifs_file->file_info_lock. */ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) { @@ -367,11 +369,15 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) struct cifs_pending_open open; bool oplock_break_cancelled; - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); + + spin_lock(&cifs_file->file_info_lock); if (--cifs_file->count > 0) { - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_file->file_info_lock); + spin_unlock(&tcon->open_file_lock); return; } + spin_unlock(&cifs_file->file_info_lock); if (server->ops->get_lease_key) server->ops->get_lease_key(inode, &fid); @@ -395,7 +401,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) set_bit(CIFS_INO_INVALID_MAPPING, &cifsi->flags); cifs_set_oplock_level(cifsi, 0); } - spin_unlock(&cifs_file_list_lock); + + spin_unlock(&tcon->open_file_lock); oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break); @@ -732,6 +739,15 @@ reopen_success: * to the server to get the new inode info. */ + /* + * If the server returned a read oplock and we have mandatory brlocks, + * set oplock level to None. 
+ */ + if (server->ops->is_read_op(oplock) && cifs_has_mand_locks(cinode)) { + cifs_dbg(FYI, "Reset oplock val from read to None due to mand locks\n"); + oplock = 0; + } + server->ops->set_fid(cfile, &cfile->fid, oplock); if (oparms.reconnect) cifs_relock_file(cfile); @@ -753,6 +769,36 @@ int cifs_close(struct inode *inode, struct file *file) return 0; } +void +cifs_reopen_persistent_handles(struct cifs_tcon *tcon) +{ + struct cifsFileInfo *open_file; + struct list_head *tmp; + struct list_head *tmp1; + struct list_head tmp_list; + + cifs_dbg(FYI, "Reopen persistent handles"); + INIT_LIST_HEAD(&tmp_list); + + /* list all files open on tree connection, reopen resilient handles */ + spin_lock(&tcon->open_file_lock); + list_for_each(tmp, &tcon->openFileList) { + open_file = list_entry(tmp, struct cifsFileInfo, tlist); + if (!open_file->invalidHandle) + continue; + cifsFileInfo_get(open_file); + list_add_tail(&open_file->rlist, &tmp_list); + } + spin_unlock(&tcon->open_file_lock); + + list_for_each_safe(tmp, tmp1, &tmp_list) { + open_file = list_entry(tmp, struct cifsFileInfo, rlist); + cifs_reopen_file(open_file, false /* do not flush */); + list_del_init(&open_file->rlist); + cifsFileInfo_put(open_file); + } +} + int cifs_closedir(struct inode *inode, struct file *file) { int rc = 0; @@ -772,10 +818,10 @@ int cifs_closedir(struct inode *inode, struct file *file) server = tcon->ses->server; cifs_dbg(FYI, "Freeing private data in close dir\n"); - spin_lock(&cifs_file_list_lock); + spin_lock(&cfile->file_info_lock); if (server->ops->dir_needs_close(cfile)) { cfile->invalidHandle = true; - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cfile->file_info_lock); if (server->ops->close_dir) rc = server->ops->close_dir(xid, tcon, &cfile->fid); else @@ -784,7 +830,7 @@ int cifs_closedir(struct inode *inode, struct file *file) /* not much we can do if it fails anyway, ignore rc */ rc = 0; } else - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cfile->file_info_lock); buf = cfile->srch_inf.ntwrk_buf_start; if (buf) { @@ -1728,12 +1774,13 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, { struct cifsFileInfo *open_file = NULL; struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); /* we could simply get the first_list_entry since write-only entries are always at the end of the list but since the first entry might have a close pending, we go through the whole list */ @@ -1744,8 +1791,8 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, if (!open_file->invalidHandle) { /* found a good file */ /* lock it so it will not be closed on us */ - cifsFileInfo_get_locked(open_file); - spin_unlock(&cifs_file_list_lock); + cifsFileInfo_get(open_file); + spin_unlock(&tcon->open_file_lock); return open_file; } /* else might as well continue, and look for another, or simply have the caller reopen it @@ -1753,7 +1800,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, } else /* write only file */ break; /* write only files are last so must be done */ } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); return NULL; } @@ -1762,6 +1809,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, { struct cifsFileInfo *open_file, *inv_file 
= NULL; struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; bool any_available = false; int rc; unsigned int refind = 0; @@ -1777,15 +1825,16 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, } cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + tcon = cifs_sb_master_tcon(cifs_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); refind_writable: if (refind > MAX_REOPEN_ATT) { - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); return NULL; } list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { @@ -1796,8 +1845,8 @@ refind_writable: if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { if (!open_file->invalidHandle) { /* found a good writable file */ - cifsFileInfo_get_locked(open_file); - spin_unlock(&cifs_file_list_lock); + cifsFileInfo_get(open_file); + spin_unlock(&tcon->open_file_lock); return open_file; } else { if (!inv_file) @@ -1813,24 +1862,24 @@ refind_writable: if (inv_file) { any_available = false; - cifsFileInfo_get_locked(inv_file); + cifsFileInfo_get(inv_file); } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); if (inv_file) { rc = cifs_reopen_file(inv_file, false); if (!rc) return inv_file; else { - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_move_tail(&inv_file->flist, &cifs_inode->openFileList); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); cifsFileInfo_put(inv_file); - spin_lock(&cifs_file_list_lock); ++refind; inv_file = NULL; + spin_lock(&tcon->open_file_lock); goto refind_writable; } } @@ -3612,15 +3661,17 @@ static int cifs_readpage(struct file *file, struct page *page) static int is_inode_writable(struct cifsInodeInfo *cifs_inode) { struct cifsFileInfo *open_file; + struct cifs_tcon *tcon = + cifs_sb_master_tcon(CIFS_SB(cifs_inode->vfs_inode.i_sb)); - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); return 1; } } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); return 0; } diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 7a3b84e300f8..9f51b81119f2 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -189,7 +189,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) xid = get_xid(); cifs_sb = CIFS_SB(inode->i_sb); - + cifs_dbg(VFS, "cifs ioctl 0x%x\n", command); switch (command) { case FS_IOC_GETFLAGS: if (pSMBFile == NULL) @@ -267,11 +267,23 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) tcon = tlink_tcon(pSMBFile->tlink); rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg); break; + case CIFS_ENUMERATE_SNAPSHOTS: + if (arg == 0) { + rc = -EINVAL; + goto cifs_ioc_exit; + } + tcon = tlink_tcon(pSMBFile->tlink); + if (tcon->ses->server->ops->enum_snapshots) + rc = tcon->ses->server->ops->enum_snapshots(xid, tcon, + pSMBFile, (void __user *)arg); + else + rc = -EOPNOTSUPP; + break; default: cifs_dbg(FYI, "unsupported ioctl\n"); break; } - +cifs_ioc_exit: free_xid(xid); return rc; } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 813fe13c2ae1..c6729156f9a0 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -120,6 +120,7 @@ tconInfoAlloc(void) ++ret_buf->tc_count; 
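
The conversions above all follow one locking model: tcon->open_file_lock now guards the per-tcon and per-inode open-file lists, the new cfile->file_info_lock guards the fields of a single cifsFileInfo, and the list-level lock is always taken first. A compile-only restatement of that ordering with simplified stand-in types (the real structs live in cifsglob.h):

#include <linux/spinlock.h>
#include <linux/types.h>

struct demo_tcon  { spinlock_t open_file_lock; };   /* list-level lock */
struct demo_cfile { spinlock_t file_info_lock; int count; };

/* Drop a reference the way cifsFileInfo_put() now does: list lock
 * first, then the per-file lock, never the other way around. */
static bool demo_put_ref(struct demo_tcon *tcon, struct demo_cfile *cfile)
{
        bool last;

        spin_lock(&tcon->open_file_lock);
        spin_lock(&cfile->file_info_lock);
        last = (--cfile->count == 0);
        spin_unlock(&cfile->file_info_lock);
        spin_unlock(&tcon->open_file_lock);
        return last;    /* if true, the caller unlinks and frees */
}
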
INIT_LIST_HEAD(&ret_buf->openFileList); INIT_LIST_HEAD(&ret_buf->tcon_list); + spin_lock_init(&ret_buf->open_file_lock); #ifdef CONFIG_CIFS_STATS spin_lock_init(&ret_buf->stat_lock); #endif @@ -465,7 +466,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) continue; cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks); - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_for_each(tmp2, &tcon->openFileList) { netfile = list_entry(tmp2, struct cifsFileInfo, tlist); @@ -495,11 +496,11 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) &netfile->oplock_break); netfile->oplock_break_cancelled = false; - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); return true; } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "No matching file for oplock break\n"); return true; @@ -613,9 +614,9 @@ backup_cred(struct cifs_sb_info *cifs_sb) void cifs_del_pending_open(struct cifs_pending_open *open) { - spin_lock(&cifs_file_list_lock); + spin_lock(&tlink_tcon(open->tlink)->open_file_lock); list_del(&open->olist); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tlink_tcon(open->tlink)->open_file_lock); } void @@ -635,7 +636,7 @@ void cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink, struct cifs_pending_open *open) { - spin_lock(&cifs_file_list_lock); + spin_lock(&tlink_tcon(tlink)->open_file_lock); cifs_add_pending_open_locked(fid, tlink, open); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tlink_tcon(open->tlink)->open_file_lock); } diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 65cf85dcda09..8f6a2a5863b9 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -597,14 +597,14 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, is_dir_changed(file)) || (index_to_find < first_entry_in_buffer)) { /* close and restart search */ cifs_dbg(FYI, "search backing up - close and restart search\n"); - spin_lock(&cifs_file_list_lock); + spin_lock(&cfile->file_info_lock); if (server->ops->dir_needs_close(cfile)) { cfile->invalidHandle = true; - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cfile->file_info_lock); if (server->ops->close_dir) server->ops->close_dir(xid, tcon, &cfile->fid); } else - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cfile->file_info_lock); if (cfile->srch_inf.ntwrk_buf_start) { cifs_dbg(FYI, "freeing SMB ff cache buf on search rewind\n"); if (cfile->srch_inf.smallBuf) diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 4f0231e685a9..1238cd3552f9 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -266,9 +266,15 @@ smb2_set_file_info(struct inode *inode, const char *full_path, struct tcon_link *tlink; int rc; + if ((buf->CreationTime == 0) && (buf->LastAccessTime == 0) && + (buf->LastWriteTime == 0) && (buf->ChangeTime == 0) && + (buf->Attributes == 0)) + return 0; /* would be a no op, no sense sending this */ + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); + rc = smb2_open_op_close(xid, tlink_tcon(tlink), cifs_sb, full_path, FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf, SMB2_OP_SET_INFO); diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 389fb9f8c84e..3d383489b9cf 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -549,19 +549,19 @@ smb2_is_valid_lease_break(char *buffer) list_for_each(tmp1, &server->smb_ses_list) { ses = list_entry(tmp1, struct cifs_ses, smb_ses_list); -
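
The short-circuit added to smb2_set_file_info() above treats an all-zero FILE_BASIC_INFO as "change nothing" and skips the compound open/set-info/close round trip entirely. The predicate, restated as standalone C (field types simplified from the on-the-wire __le64/__le32):

#include <stdint.h>
#include <stdbool.h>

/* Simplified FILE_BASIC_INFO: a zero in every field means the server
 * is being asked to change nothing, so the request can be elided. */
struct basic_info {
        uint64_t creation_time, last_access_time, last_write_time, change_time;
        uint32_t attributes;
};

static bool set_info_is_noop(const struct basic_info *buf)
{
        return buf->creation_time == 0 && buf->last_access_time == 0 &&
               buf->last_write_time == 0 && buf->change_time == 0 &&
               buf->attributes == 0;
}
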
spin_lock(&cifs_file_list_lock); list_for_each(tmp2, &ses->tcon_list) { tcon = list_entry(tmp2, struct cifs_tcon, tcon_list); + spin_lock(&tcon->open_file_lock); cifs_stats_inc( &tcon->stats.cifs_stats.num_oplock_brks); if (smb2_tcon_has_lease(tcon, rsp, lw)) { - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); return true; } + spin_unlock(&tcon->open_file_lock); } - spin_unlock(&cifs_file_list_lock); } } spin_unlock(&cifs_tcp_ses_lock); @@ -603,7 +603,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks); - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_for_each(tmp2, &tcon->openFileList) { cfile = list_entry(tmp2, struct cifsFileInfo, tlist); @@ -615,7 +615,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) cifs_dbg(FYI, "file id match, oplock break\n"); cinode = CIFS_I(d_inode(cfile->dentry)); - + spin_lock(&cfile->file_info_lock); if (!CIFS_CACHE_WRITE(cinode) && rsp->OplockLevel == SMB2_OPLOCK_LEVEL_NONE) cfile->oplock_break_cancelled = true; @@ -637,14 +637,14 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) clear_bit( CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags); - + spin_unlock(&cfile->file_info_lock); queue_work(cifsiod_wq, &cfile->oplock_break); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); return true; } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "No matching file for oplock break\n"); return true; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index d203c0329626..5d456ebb3813 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -28,6 +28,7 @@ #include "cifs_unicode.h" #include "smb2status.h" #include "smb2glob.h" +#include "cifs_ioctl.h" static int change_conf(struct TCP_Server_Info *server) @@ -70,6 +71,10 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, spin_lock(&server->req_lock); val = server->ops->get_credits_field(server, optype); *val += add; + if (*val > 65000) { + *val = 65000; /* Don't get near 64K credits, avoid srv bugs */ + printk_once(KERN_WARNING "server overflowed SMB3 credits\n"); + } server->in_flight--; if (server->in_flight == 0 && (optype & CIFS_OP_MASK) != CIFS_NEG_OP) rc = change_conf(server); @@ -287,7 +292,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) cifs_dbg(FYI, "Link Speed %lld\n", le64_to_cpu(out_buf->LinkSpeed)); } - + kfree(out_buf); return rc; } #endif /* STATS2 */ @@ -541,6 +546,7 @@ smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) server->ops->set_oplock_level(cinode, oplock, fid->epoch, &fid->purge_cache); cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode); + memcpy(cfile->fid.create_guid, fid->create_guid, 16); } static void @@ -699,6 +705,7 @@ smb2_clone_range(const unsigned int xid, cchunk_out: kfree(pcchunk); + kfree(retbuf); return rc; } @@ -823,7 +830,6 @@ smb2_duplicate_extents(const unsigned int xid, { int rc; unsigned int ret_data_len; - char *retbuf = NULL; struct duplicate_extents_to_file dup_ext_buf; struct cifs_tcon *tcon = tlink_tcon(trgtfile->tlink); @@ -849,7 +855,7 @@ smb2_duplicate_extents(const unsigned int xid, FSCTL_DUPLICATE_EXTENTS_TO_FILE, true /* is_fsctl */, (char *)&dup_ext_buf, sizeof(struct 
duplicate_extents_to_file), - (char **)&retbuf, + NULL, &ret_data_len); if (ret_data_len > 0) @@ -872,7 +878,6 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *cfile) { struct fsctl_set_integrity_information_req integr_info; - char *retbuf = NULL; unsigned int ret_data_len; integr_info.ChecksumAlgorithm = cpu_to_le16(CHECKSUM_TYPE_UNCHANGED); @@ -884,9 +889,53 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, FSCTL_SET_INTEGRITY_INFORMATION, true /* is_fsctl */, (char *)&integr_info, sizeof(struct fsctl_set_integrity_information_req), + NULL, + &ret_data_len); + +} + +static int +smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, void __user *ioc_buf) +{ + char *retbuf = NULL; + unsigned int ret_data_len = 0; + int rc; + struct smb_snapshot_array snapshot_in; + + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + FSCTL_SRV_ENUMERATE_SNAPSHOTS, + true /* is_fsctl */, NULL, 0 /* no input data */, (char **)&retbuf, &ret_data_len); + cifs_dbg(FYI, "enum snaphots ioctl returned %d and ret buflen is %d\n", + rc, ret_data_len); + if (rc) + return rc; + if (ret_data_len && (ioc_buf != NULL) && (retbuf != NULL)) { + /* Fixup buffer */ + if (copy_from_user(&snapshot_in, ioc_buf, + sizeof(struct smb_snapshot_array))) { + rc = -EFAULT; + kfree(retbuf); + return rc; + } + if (snapshot_in.snapshot_array_size < sizeof(struct smb_snapshot_array)) { + rc = -ERANGE; + return rc; + } + + if (ret_data_len > snapshot_in.snapshot_array_size) + ret_data_len = snapshot_in.snapshot_array_size; + + if (copy_to_user(ioc_buf, retbuf, ret_data_len)) + rc = -EFAULT; + } + + kfree(retbuf); + return rc; } static int @@ -1041,7 +1090,7 @@ smb2_set_lease_key(struct inode *inode, struct cifs_fid *fid) static void smb2_new_lease_key(struct cifs_fid *fid) { - get_random_bytes(fid->lease_key, SMB2_LEASE_KEY_SIZE); + generate_random_uuid(fid->lease_key); } #define SMB2_SYMLINK_STRUCT_SIZE \ @@ -1654,6 +1703,7 @@ struct smb_version_operations smb21_operations = { .clone_range = smb2_clone_range, .wp_retry_size = smb2_wp_retry_size, .dir_needs_close = smb2_dir_needs_close, + .enum_snapshots = smb3_enum_snapshots, }; struct smb_version_operations smb30_operations = { @@ -1740,6 +1790,7 @@ struct smb_version_operations smb30_operations = { .wp_retry_size = smb2_wp_retry_size, .dir_needs_close = smb2_dir_needs_close, .fallocate = smb3_fallocate, + .enum_snapshots = smb3_enum_snapshots, }; #ifdef CONFIG_CIFS_SMB311 @@ -1827,6 +1878,7 @@ struct smb_version_operations smb311_operations = { .wp_retry_size = smb2_wp_retry_size, .dir_needs_close = smb2_dir_needs_close, .fallocate = smb3_fallocate, + .enum_snapshots = smb3_enum_snapshots, }; #endif /* CIFS_SMB311 */ diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 29e06db5f187..5ca5ea4668a1 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -100,7 +100,21 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ , hdr->ProtocolId = SMB2_PROTO_NUMBER; hdr->StructureSize = cpu_to_le16(64); hdr->Command = smb2_cmd; - hdr->CreditRequest = cpu_to_le16(2); /* BB make this dynamic */ + if (tcon && tcon->ses && tcon->ses->server) { + struct TCP_Server_Info *server = tcon->ses->server; + + spin_lock(&server->req_lock); + /* Request up to 2 credits but don't go over the limit. 
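
The CreditRequest arithmetic in smb2_hdr_assemble() keeps the client's outstanding credits at or below the mount-time ceiling: ask for the usual 2 while there is room, top up by only the remaining gap near the ceiling, and ask for 0 once it is reached. A standalone restatement (the function name is invented here):

/* Mirrors min_t(int, server->max_credits - server->credits, 2),
 * with 0 requested once the ceiling has been reached. */
static unsigned short credit_request(int credits, int max_credits)
{
        int room = max_credits - credits;

        if (room <= 0)
                return 0;
        return room < 2 ? (unsigned short)room : 2;
}

For example, credit_request(31999, 32000) is 1 and credit_request(32000, 32000) is 0, matching the two branches taken under req_lock above.
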
*/ + if (server->credits >= server->max_credits) + hdr->CreditRequest = cpu_to_le16(0); + else + hdr->CreditRequest = cpu_to_le16( + min_t(int, server->max_credits - + server->credits, 2)); + spin_unlock(&server->req_lock); + } else { + hdr->CreditRequest = cpu_to_le16(2); + } hdr->ProcessId = cpu_to_le32((__u16)current->tgid); if (!tcon) @@ -236,8 +250,13 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) } cifs_mark_open_files_invalid(tcon); + rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nls_codepage); mutex_unlock(&tcon->ses->session_mutex); + + if (tcon->use_persistent) + cifs_reopen_persistent_handles(tcon); + cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); if (rc) goto out; @@ -574,59 +593,42 @@ vneg_out: return -EIO; } -int -SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_cp) +struct SMB2_sess_data { + unsigned int xid; + struct cifs_ses *ses; + struct nls_table *nls_cp; + void (*func)(struct SMB2_sess_data *); + int result; + u64 previous_session; + + /* we will send the SMB in three pieces: + * a fixed length beginning part, an optional + * SPNEGO blob (which can be zero length), and a + * last part which will include the strings + * and rest of bcc area. This allows us to avoid + * a large buffer 17K allocation + */ + int buf0_type; + struct kvec iov[2]; +}; + +static int +SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) { + int rc; + struct cifs_ses *ses = sess_data->ses; struct smb2_sess_setup_req *req; - struct smb2_sess_setup_rsp *rsp = NULL; - struct kvec iov[2]; - int rc = 0; - int resp_buftype = CIFS_NO_BUFFER; - __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ struct TCP_Server_Info *server = ses->server; - u16 blob_length = 0; - struct key *spnego_key = NULL; - char *security_blob = NULL; - unsigned char *ntlmssp_blob = NULL; - bool use_spnego = false; /* else use raw ntlmssp */ - - cifs_dbg(FYI, "Session Setup\n"); - - if (!server) { - WARN(1, "%s: server is NULL!\n", __func__); - return -EIO; - } - - /* - * If we are here due to reconnect, free per-smb session key - * in case signing was required. - */ - kfree(ses->auth_key.response); - ses->auth_key.response = NULL; - - /* - * If memory allocation is successful, caller of this function - * frees it. - */ - ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); - if (!ses->ntlmssp) - return -ENOMEM; - ses->ntlmssp->sesskey_per_smbsess = true; - - /* FIXME: allow for other auth types besides NTLMSSP (e.g. 
krb5) */ - if (ses->sectype != Kerberos && ses->sectype != RawNTLMSSP) - ses->sectype = RawNTLMSSP; - -ssetup_ntlmssp_authenticate: - if (phase == NtLmChallenge) - phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ rc = small_smb2_init(SMB2_SESSION_SETUP, NULL, (void **) &req); if (rc) return rc; req->hdr.SessionId = 0; /* First session, not a reauthenticate */ + + /* if reconnect, we need to send previous sess id, otherwise it is 0 */ + req->PreviousSessionId = sess_data->previous_session; + req->Flags = 0; /* MBZ */ /* to enable echos and oplocks */ req->hdr.CreditRequest = cpu_to_le16(3); @@ -642,199 +644,368 @@ ssetup_ntlmssp_authenticate: req->Capabilities = 0; req->Channel = 0; /* MBZ */ - iov[0].iov_base = (char *)req; + sess_data->iov[0].iov_base = (char *)req; /* 4 for rfc1002 length field and 1 for pad */ - iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + sess_data->iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + /* + * This variable will be used to clear the buffer + * allocated above in case of any error in the calling function. + */ + sess_data->buf0_type = CIFS_SMALL_BUFFER; - if (ses->sectype == Kerberos) { -#ifdef CONFIG_CIFS_UPCALL - struct cifs_spnego_msg *msg; + return 0; +} - spnego_key = cifs_get_spnego_key(ses); - if (IS_ERR(spnego_key)) { - rc = PTR_ERR(spnego_key); - spnego_key = NULL; - goto ssetup_exit; - } +static void +SMB2_sess_free_buffer(struct SMB2_sess_data *sess_data) +{ + free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base); + sess_data->buf0_type = CIFS_NO_BUFFER; +} - msg = spnego_key->payload.data[0]; - /* - * check version field to make sure that cifs.upcall is - * sending us a response in an expected form - */ - if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { - cifs_dbg(VFS, - "bad cifs.upcall version. 
Expected %d got %d", - CIFS_SPNEGO_UPCALL_VERSION, msg->version); - rc = -EKEYREJECTED; - goto ssetup_exit; - } - ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, - GFP_KERNEL); - if (!ses->auth_key.response) { - cifs_dbg(VFS, - "Kerberos can't allocate (%u bytes) memory", - msg->sesskey_len); - rc = -ENOMEM; - goto ssetup_exit; - } - ses->auth_key.len = msg->sesskey_len; - blob_length = msg->secblob_len; - iov[1].iov_base = msg->data + msg->sesskey_len; - iov[1].iov_len = blob_length; -#else - rc = -EOPNOTSUPP; - goto ssetup_exit; -#endif /* CONFIG_CIFS_UPCALL */ - } else if (phase == NtLmNegotiate) { /* if not krb5 must be ntlmssp */ - ntlmssp_blob = kmalloc(sizeof(struct _NEGOTIATE_MESSAGE), - GFP_KERNEL); - if (ntlmssp_blob == NULL) { - rc = -ENOMEM; - goto ssetup_exit; - } - build_ntlmssp_negotiate_blob(ntlmssp_blob, ses); - if (use_spnego) { - /* blob_length = build_spnego_ntlmssp_blob( - &security_blob, - sizeof(struct _NEGOTIATE_MESSAGE), - ntlmssp_blob); */ - /* BB eventually need to add this */ - cifs_dbg(VFS, "spnego not supported for SMB2 yet\n"); - rc = -EOPNOTSUPP; - kfree(ntlmssp_blob); - goto ssetup_exit; - } else { - blob_length = sizeof(struct _NEGOTIATE_MESSAGE); - /* with raw NTLMSSP we don't encapsulate in SPNEGO */ - security_blob = ntlmssp_blob; - } - iov[1].iov_base = security_blob; - iov[1].iov_len = blob_length; - } else if (phase == NtLmAuthenticate) { - req->hdr.SessionId = ses->Suid; - rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses, - nls_cp); - if (rc) { - cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n", - rc); - goto ssetup_exit; /* BB double check error handling */ - } - if (use_spnego) { - /* blob_length = build_spnego_ntlmssp_blob( - &security_blob, - blob_length, - ntlmssp_blob); */ - cifs_dbg(VFS, "spnego not supported for SMB2 yet\n"); - rc = -EOPNOTSUPP; - kfree(ntlmssp_blob); - goto ssetup_exit; - } else { - security_blob = ntlmssp_blob; - } - iov[1].iov_base = security_blob; - iov[1].iov_len = blob_length; - } else { - cifs_dbg(VFS, "illegal ntlmssp phase\n"); - rc = -EIO; - goto ssetup_exit; - } +static int +SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) +{ + int rc; + struct smb2_sess_setup_req *req = sess_data->iov[0].iov_base; /* Testing shows that buffer offset must be at location of Buffer[0] */ req->SecurityBufferOffset = - cpu_to_le16(sizeof(struct smb2_sess_setup_req) - - 1 /* pad */ - 4 /* rfc1001 len */); - req->SecurityBufferLength = cpu_to_le16(blob_length); + cpu_to_le16(sizeof(struct smb2_sess_setup_req) - + 1 /* pad */ - 4 /* rfc1001 len */); + req->SecurityBufferLength = cpu_to_le16(sess_data->iov[1].iov_len); - inc_rfc1001_len(req, blob_length - 1 /* pad */); + inc_rfc1001_len(req, sess_data->iov[1].iov_len - 1 /* pad */); /* BB add code to build os and lm fields */ - rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, - CIFS_LOG_ERROR | CIFS_NEG_OP); + rc = SendReceive2(sess_data->xid, sess_data->ses, + sess_data->iov, 2, + &sess_data->buf0_type, + CIFS_LOG_ERROR | CIFS_NEG_OP); - kfree(security_blob); - rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base; - ses->Suid = rsp->hdr.SessionId; - if (resp_buftype != CIFS_NO_BUFFER && - rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) { - if (phase != NtLmNegotiate) { - cifs_dbg(VFS, "Unexpected more processing error\n"); - goto ssetup_exit; - } - if (offsetof(struct smb2_sess_setup_rsp, Buffer) - 4 != - le16_to_cpu(rsp->SecurityBufferOffset)) { - cifs_dbg(VFS, "Invalid security buffer offset %d\n", - le16_to_cpu(rsp->SecurityBufferOffset)); - 
rc = -EIO; - goto ssetup_exit; + return rc; +} + +static int +SMB2_sess_establish_session(struct SMB2_sess_data *sess_data) +{ + int rc = 0; + struct cifs_ses *ses = sess_data->ses; + + mutex_lock(&ses->server->srv_mutex); + if (ses->server->sign && ses->server->ops->generate_signingkey) { + rc = ses->server->ops->generate_signingkey(ses); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + if (rc) { + cifs_dbg(FYI, + "SMB3 session key generation failed\n"); + mutex_unlock(&ses->server->srv_mutex); + goto keygen_exit; } + } + if (!ses->server->session_estab) { + ses->server->sequence_number = 0x2; + ses->server->session_estab = true; + } + mutex_unlock(&ses->server->srv_mutex); + + cifs_dbg(FYI, "SMB2/3 session established successfully\n"); + spin_lock(&GlobalMid_Lock); + ses->status = CifsGood; + ses->need_reconnect = false; + spin_unlock(&GlobalMid_Lock); - /* NTLMSSP Negotiate sent now processing challenge (response) */ - phase = NtLmChallenge; /* process ntlmssp challenge */ - rc = 0; /* MORE_PROCESSING is not an error here but expected */ - rc = decode_ntlmssp_challenge(rsp->Buffer, - le16_to_cpu(rsp->SecurityBufferLength), ses); +keygen_exit: + if (!ses->server->sign) { + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + } + return rc; +} + +#ifdef CONFIG_CIFS_UPCALL +static void +SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) +{ + int rc; + struct cifs_ses *ses = sess_data->ses; + struct cifs_spnego_msg *msg; + struct key *spnego_key = NULL; + struct smb2_sess_setup_rsp *rsp = NULL; + + rc = SMB2_sess_alloc_buffer(sess_data); + if (rc) + goto out; + + spnego_key = cifs_get_spnego_key(ses); + if (IS_ERR(spnego_key)) { + rc = PTR_ERR(spnego_key); + spnego_key = NULL; + goto out; } + msg = spnego_key->payload.data[0]; /* - * BB eventually add code for SPNEGO decoding of NtlmChallenge blob, - * but at least the raw NTLMSSP case works. + * check version field to make sure that cifs.upcall is + * sending us a response in an expected form */ + if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { + cifs_dbg(VFS, + "bad cifs.upcall version. 
Expected %d got %d", + CIFS_SPNEGO_UPCALL_VERSION, msg->version); + rc = -EKEYREJECTED; + goto out_put_spnego_key; + } + + ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, + GFP_KERNEL); + if (!ses->auth_key.response) { + cifs_dbg(VFS, + "Kerberos can't allocate (%u bytes) memory", + msg->sesskey_len); + rc = -ENOMEM; + goto out_put_spnego_key; + } + ses->auth_key.len = msg->sesskey_len; + + sess_data->iov[1].iov_base = msg->data + msg->sesskey_len; + sess_data->iov[1].iov_len = msg->secblob_len; + + rc = SMB2_sess_sendreceive(sess_data); + if (rc) + goto out_put_spnego_key; + + rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; + ses->Suid = rsp->hdr.SessionId; + + ses->session_flags = le16_to_cpu(rsp->SessionFlags); + if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) + cifs_dbg(VFS, "SMB3 encryption not supported yet\n"); + + rc = SMB2_sess_establish_session(sess_data); +out_put_spnego_key: + key_invalidate(spnego_key); + key_put(spnego_key); +out: + sess_data->result = rc; + sess_data->func = NULL; + SMB2_sess_free_buffer(sess_data); +} +#else +static void +SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) +{ + cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); + sess_data->result = -EOPNOTSUPP; + sess_data->func = NULL; +} +#endif + +static void +SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data); + +static void +SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) +{ + int rc; + struct cifs_ses *ses = sess_data->ses; + struct smb2_sess_setup_rsp *rsp = NULL; + char *ntlmssp_blob = NULL; + bool use_spnego = false; /* else use raw ntlmssp */ + u16 blob_length = 0; + /* - * No tcon so can't do - * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]); + * If memory allocation is successful, caller of this function + * frees it. 
*/ - if (rc != 0) - goto ssetup_exit; + ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); + if (!ses->ntlmssp) { + rc = -ENOMEM; + goto out_err; + } + ses->ntlmssp->sesskey_per_smbsess = true; + + rc = SMB2_sess_alloc_buffer(sess_data); + if (rc) + goto out_err; + + ntlmssp_blob = kmalloc(sizeof(struct _NEGOTIATE_MESSAGE), + GFP_KERNEL); + if (ntlmssp_blob == NULL) { + rc = -ENOMEM; + goto out; + } + + build_ntlmssp_negotiate_blob(ntlmssp_blob, ses); + if (use_spnego) { + /* BB eventually need to add this */ + cifs_dbg(VFS, "spnego not supported for SMB2 yet\n"); + rc = -EOPNOTSUPP; + goto out; + } else { + blob_length = sizeof(struct _NEGOTIATE_MESSAGE); + /* with raw NTLMSSP we don't encapsulate in SPNEGO */ + } + sess_data->iov[1].iov_base = ntlmssp_blob; + sess_data->iov[1].iov_len = blob_length; + + rc = SMB2_sess_sendreceive(sess_data); + rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; + + /* If true, rc here is expected and not an error */ + if (sess_data->buf0_type != CIFS_NO_BUFFER && + rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) + rc = 0; + + if (rc) + goto out; + + if (offsetof(struct smb2_sess_setup_rsp, Buffer) - 4 != + le16_to_cpu(rsp->SecurityBufferOffset)) { + cifs_dbg(VFS, "Invalid security buffer offset %d\n", + le16_to_cpu(rsp->SecurityBufferOffset)); + rc = -EIO; + goto out; + } + rc = decode_ntlmssp_challenge(rsp->Buffer, + le16_to_cpu(rsp->SecurityBufferLength), ses); + if (rc) + goto out; + + cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); + + ses->Suid = rsp->hdr.SessionId; ses->session_flags = le16_to_cpu(rsp->SessionFlags); if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) cifs_dbg(VFS, "SMB3 encryption not supported yet\n"); -ssetup_exit: - free_rsp_buf(resp_buftype, rsp); - - /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */ - if ((phase == NtLmChallenge) && (rc == 0)) - goto ssetup_ntlmssp_authenticate; +out: + kfree(ntlmssp_blob); + SMB2_sess_free_buffer(sess_data); if (!rc) { - mutex_lock(&server->srv_mutex); - if (server->sign && server->ops->generate_signingkey) { - rc = server->ops->generate_signingkey(ses); - kfree(ses->auth_key.response); - ses->auth_key.response = NULL; - if (rc) { - cifs_dbg(FYI, - "SMB3 session key generation failed\n"); - mutex_unlock(&server->srv_mutex); - goto keygen_exit; - } - } - if (!server->session_estab) { - server->sequence_number = 0x2; - server->session_estab = true; - } - mutex_unlock(&server->srv_mutex); - - cifs_dbg(FYI, "SMB2/3 session established successfully\n"); - spin_lock(&GlobalMid_Lock); - ses->status = CifsGood; - ses->need_reconnect = false; - spin_unlock(&GlobalMid_Lock); + sess_data->result = 0; + sess_data->func = SMB2_sess_auth_rawntlmssp_authenticate; + return; } +out_err: + kfree(ses->ntlmssp); + ses->ntlmssp = NULL; + sess_data->result = rc; + sess_data->func = NULL; +} -keygen_exit: - if (!server->sign) { - kfree(ses->auth_key.response); - ses->auth_key.response = NULL; +static void +SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) +{ + int rc; + struct cifs_ses *ses = sess_data->ses; + struct smb2_sess_setup_req *req; + struct smb2_sess_setup_rsp *rsp = NULL; + unsigned char *ntlmssp_blob = NULL; + bool use_spnego = false; /* else use raw ntlmssp */ + u16 blob_length = 0; + + rc = SMB2_sess_alloc_buffer(sess_data); + if (rc) + goto out; + + req = (struct smb2_sess_setup_req *) sess_data->iov[0].iov_base; + req->hdr.SessionId = ses->Suid; + + rc = build_ntlmssp_auth_blob(&ntlmssp_blob, 
&blob_length, ses, + sess_data->nls_cp); + if (rc) { + cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n", rc); + goto out; } - if (spnego_key) { - key_invalidate(spnego_key); - key_put(spnego_key); + + if (use_spnego) { + /* BB eventually need to add this */ + cifs_dbg(VFS, "spnego not supported for SMB2 yet\n"); + rc = -EOPNOTSUPP; + goto out; } + sess_data->iov[1].iov_base = ntlmssp_blob; + sess_data->iov[1].iov_len = blob_length; + + rc = SMB2_sess_sendreceive(sess_data); + if (rc) + goto out; + + rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; + + ses->Suid = rsp->hdr.SessionId; + ses->session_flags = le16_to_cpu(rsp->SessionFlags); + if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) + cifs_dbg(VFS, "SMB3 encryption not supported yet\n"); + + rc = SMB2_sess_establish_session(sess_data); +out: + kfree(ntlmssp_blob); + SMB2_sess_free_buffer(sess_data); kfree(ses->ntlmssp); + ses->ntlmssp = NULL; + sess_data->result = rc; + sess_data->func = NULL; +} +static int +SMB2_select_sec(struct cifs_ses *ses, struct SMB2_sess_data *sess_data) +{ + if (ses->sectype != Kerberos && ses->sectype != RawNTLMSSP) + ses->sectype = RawNTLMSSP; + + switch (ses->sectype) { + case Kerberos: + sess_data->func = SMB2_auth_kerberos; + break; + case RawNTLMSSP: + sess_data->func = SMB2_sess_auth_rawntlmssp_negotiate; + break; + default: + cifs_dbg(VFS, "secType %d not supported!\n", ses->sectype); + return -EOPNOTSUPP; + } + + return 0; +} + +int +SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + int rc = 0; + struct TCP_Server_Info *server = ses->server; + struct SMB2_sess_data *sess_data; + + cifs_dbg(FYI, "Session Setup\n"); + + if (!server) { + WARN(1, "%s: server is NULL!\n", __func__); + return -EIO; + } + + sess_data = kzalloc(sizeof(struct SMB2_sess_data), GFP_KERNEL); + if (!sess_data) + return -ENOMEM; + + rc = SMB2_select_sec(ses, sess_data); + if (rc) + goto out; + sess_data->xid = xid; + sess_data->ses = ses; + sess_data->buf0_type = CIFS_NO_BUFFER; + sess_data->nls_cp = (struct nls_table *) nls_cp; + + while (sess_data->func) + sess_data->func(sess_data); + + rc = sess_data->result; +out: + kfree(sess_data); return rc; } @@ -1164,7 +1335,7 @@ create_durable_v2_buf(struct cifs_fid *pfid) buf->dcontext.Timeout = 0; /* Should this be configurable by workload */ buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT); - get_random_bytes(buf->dcontext.CreateGuid, 16); + generate_random_uuid(buf->dcontext.CreateGuid); memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16); /* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DH2Q" */ @@ -2057,6 +2228,7 @@ smb2_async_readv(struct cifs_readdata *rdata) if (rdata->credits) { buf->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, SMB2_MAX_BUFFER_SIZE)); + buf->CreditRequest = buf->CreditCharge; spin_lock(&server->req_lock); server->credits += rdata->credits - le16_to_cpu(buf->CreditCharge); @@ -2243,6 +2415,7 @@ smb2_async_writev(struct cifs_writedata *wdata, if (wdata->credits) { req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, SMB2_MAX_BUFFER_SIZE)); + req->hdr.CreditRequest = req->hdr.CreditCharge; spin_lock(&server->req_lock); server->credits += wdata->credits - le16_to_cpu(req->hdr.CreditCharge); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index ff88d9feb01e..fd3709e8de33 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -276,7 +276,7 @@ struct smb2_sess_setup_req { __le32 Channel; __le16 SecurityBufferOffset; __le16 SecurityBufferLength; - __le64 
PreviousSessionId; + __u64 PreviousSessionId; __u8 Buffer[1]; /* variable length GSS security buffer */ } __packed; diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 5e23f64c0804..20af5187ba63 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -33,7 +33,8 @@ #define MAX_EA_VALUE_SIZE 65535 #define CIFS_XATTR_CIFS_ACL "system.cifs_acl" - +#define CIFS_XATTR_ATTRIB "cifs.dosattrib" /* full name: user.cifs.dosattrib */ +#define CIFS_XATTR_CREATETIME "cifs.creationtime" /* user.cifs.creationtime */ /* BB need to add server (Samba e.g) support for security and trusted prefix */ enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT }; @@ -144,6 +145,54 @@ out: return rc; } +static int cifs_attrib_get(struct dentry *dentry, + struct inode *inode, void *value, + size_t size) +{ + ssize_t rc; + __u32 *pattribute; + + rc = cifs_revalidate_dentry_attr(dentry); + + if (rc) + return rc; + + if ((value == NULL) || (size == 0)) + return sizeof(__u32); + else if (size < sizeof(__u32)) + return -ERANGE; + + /* return dos attributes as pseudo xattr */ + pattribute = (__u32 *)value; + *pattribute = CIFS_I(inode)->cifsAttrs; + + return sizeof(__u32); +} + +static int cifs_creation_time_get(struct dentry *dentry, struct inode *inode, + void *value, size_t size) +{ + ssize_t rc; + __u64 * pcreatetime; + + rc = cifs_revalidate_dentry_attr(dentry); + if (rc) + return rc; + + if ((value == NULL) || (size == 0)) + return sizeof(__u64); + else if (size < sizeof(__u64)) + return -ERANGE; + + /* return dos attributes as pseudo xattr */ + pcreatetime = (__u64 *)value; + *pcreatetime = CIFS_I(inode)->createtime; + return sizeof(__u64); + + return rc; +} + + static int cifs_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) @@ -168,10 +217,19 @@ static int cifs_xattr_get(const struct xattr_handler *handler, rc = -ENOMEM; goto out; } - /* return dos attributes as pseudo xattr */ + /* return alt name if available as pseudo attr */ switch (handler->flags) { case XATTR_USER: + cifs_dbg(FYI, "%s:querying user xattr %s\n", __func__, name); + if (strcmp(name, CIFS_XATTR_ATTRIB) == 0) { + rc = cifs_attrib_get(dentry, inode, value, size); + break; + } else if (strcmp(name, CIFS_XATTR_CREATETIME) == 0) { + rc = cifs_creation_time_get(dentry, inode, value, size); + break; + } + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto out; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index c1e9f29c924c..f2d7402abe02 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1209,6 +1209,8 @@ COMPATIBLE_IOCTL(WDIOC_SETOPTIONS) COMPATIBLE_IOCTL(WDIOC_KEEPALIVE) COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT) COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT) +COMPATIBLE_IOCTL(WDIOC_SETPRETIMEOUT) +COMPATIBLE_IOCTL(WDIOC_GETPRETIMEOUT) /* Big R */ COMPATIBLE_IOCTL(RNDGETENTCNT) COMPATIBLE_IOCTL(RNDADDTOENTCNT) diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index d42ff527ab21..d8072bc074a4 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -778,7 +778,7 @@ try_again: fail: EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n", inode->i_ino, page->index, ret); - set_bit(AS_EIO, &page->mapping->flags); + mapping_set_error(page->mapping, -EIO); unlock_page(page); return ret; } diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 207ba8d627ca..a4b531be9168 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -428,10 +428,10 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, if (!nop || !nop->fh_to_dentry) 
return ERR_PTR(-ESTALE); result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); - if (!result) - result = ERR_PTR(-ESTALE); - if (IS_ERR(result)) - return result; + if (PTR_ERR(result) == -ENOMEM) + return ERR_CAST(result); + if (IS_ERR_OR_NULL(result)) + return ERR_PTR(-ESTALE); if (d_is_dir(result)) { /* @@ -541,6 +541,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, err_result: dput(result); + if (err != -ENOMEM) + err = -ESTALE; return ERR_PTR(err); } EXPORT_SYMBOL_GPL(exportfs_decode_fh); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index b4cbee936cf8..0094923e5ebf 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -88,7 +88,7 @@ static void ext4_finish_bio(struct bio *bio) if (bio->bi_error) { SetPageError(page); - set_bit(AS_EIO, &page->mapping->flags); + mapping_set_error(page->mapping, -EIO); } bh = head = page_buffers(page); /* diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0d0177c9149c..9ae194fd2fdb 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -75,7 +75,7 @@ static void f2fs_write_end_io(struct bio *bio) fscrypt_pullback_bio_page(&page, true); if (unlikely(bio->bi_error)) { - set_bit(AS_EIO, &page->mapping->flags); + mapping_set_error(page->mapping, -EIO); f2fs_stop_checkpoint(sbi, true); } end_page_writeback(page); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 5bb565f9989c..31f8ca046639 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -269,8 +269,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, * filemap_fdatawait_range(), set it again so * that user process can get -EIO from fsync(). */ - set_bit(AS_EIO, - &jinode->i_vfs_inode->i_mapping->flags); + mapping_set_error(jinode->i_vfs_inode->i_mapping, -EIO); if (!ret) ret = err; diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index dcd96aac02f5..cf4c636ff4da 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -110,8 +110,9 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a, * kn_to: /n1/n2/n3 [depth=3] * result: /../.. * - * return value: length of the string. If greater than buflen, - * then contents of buf are undefined. On error, -1 is returned. + * Returns the length of the full path. If the full length is equal to or + * greater than @buflen, @buf contains the truncated path with the trailing + * '\0'. On error, -errno is returned. */ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, struct kernfs_node *kn_from, @@ -119,9 +120,8 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, { struct kernfs_node *kn, *common; const char parent_str[] = "/.."; - size_t depth_from, depth_to, len = 0, nlen = 0; - char *p; - int i; + size_t depth_from, depth_to, len = 0; + int i, j; if (!kn_from) kn_from = kernfs_root(kn_to)->kn; @@ -131,7 +131,7 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, common = kernfs_common_ancestor(kn_from, kn_to); if (WARN_ON(!common)) - return -1; + return -EINVAL; depth_to = kernfs_depth(common, kn_to); depth_from = kernfs_depth(common, kn_from); @@ -144,22 +144,16 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, len < buflen ? 
buflen - len : 0); /* Calculate how many bytes we need for the rest */ - for (kn = kn_to; kn != common; kn = kn->parent) - nlen += strlen(kn->name) + 1; - - if (len + nlen >= buflen) - return len + nlen; - - p = buf + len + nlen; - *p = '\0'; - for (kn = kn_to; kn != common; kn = kn->parent) { - size_t tmp = strlen(kn->name); - p -= tmp; - memcpy(p, kn->name, tmp); - *(--p) = '/'; + for (i = depth_to - 1; i >= 0; i--) { + for (kn = kn_to, j = 0; j < i; j++) + kn = kn->parent; + len += strlcpy(buf + len, "/", + len < buflen ? buflen - len : 0); + len += strlcpy(buf + len, kn->name, + len < buflen ? buflen - len : 0); } - return len + nlen; + return len; } /** @@ -186,29 +180,6 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) } /** - * kernfs_path_len - determine the length of the full path of a given node - * @kn: kernfs_node of interest - * - * The returned length doesn't include the space for the terminating '\0'. - */ -size_t kernfs_path_len(struct kernfs_node *kn) -{ - size_t len = 0; - unsigned long flags; - - spin_lock_irqsave(&kernfs_rename_lock, flags); - - do { - len += strlen(kn->name) + 1; - kn = kn->parent; - } while (kn && kn->parent); - - spin_unlock_irqrestore(&kernfs_rename_lock, flags); - - return len; -} - -/** * kernfs_path_from_node - build path of node @to relative to @from. * @from: parent kernfs_node relative to which we need to build the path * @to: kernfs_node of interest @@ -220,8 +191,9 @@ size_t kernfs_path_len(struct kernfs_node *kn) * path (which includes '..'s) as needed to reach from @from to @to is * returned. * - * If @buf isn't long enough, the return value will be greater than @buflen - * and @buf contents are undefined. + * Returns the length of the full path. If the full length is equal to or + * greater than @buflen, @buf contains the truncated path with the trailing + * '\0'. On error, -errno is returned. */ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, char *buf, size_t buflen) @@ -237,28 +209,6 @@ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, EXPORT_SYMBOL_GPL(kernfs_path_from_node); /** - * kernfs_path - build full path of a given node - * @kn: kernfs_node of interest - * @buf: buffer to copy @kn's name into - * @buflen: size of @buf - * - * Builds and returns the full path of @kn in @buf of @buflen bytes. The - * path is built from the end of @buf so the returned pointer usually - * doesn't match @buf. If @buf isn't long enough, @buf is nul terminated - * and %NULL is returned. 
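
With the strlcpy-style rewrite above, kernfs_path_from_node() behaves like snprintf(): a return value of buflen or more means the path was truncated but buf is still NUL-terminated, and a negative return is an -errno. A sketch of the checking now expected of callers (the -ENAMETOOLONG choice is this example's, not the patch's):

#include <linux/kernfs.h>
#include <linux/errno.h>

static int show_node_path(struct kernfs_node *kn, char *buf, size_t buflen)
{
        int len = kernfs_path_from_node(kn, NULL, buf, buflen);

        if (len < 0)                    /* e.g. -EINVAL, no common ancestor */
                return len;
        if ((size_t)len >= buflen)
                return -ENAMETOOLONG;   /* truncated; retry with len + 1 */
        return 0;
}
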
- */ -char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) -{ - int ret; - - ret = kernfs_path_from_node(kn, NULL, buf, buflen); - if (ret < 0 || ret >= buflen) - return NULL; - return buf; -} -EXPORT_SYMBOL_GPL(kernfs_path); - -/** * pr_cont_kernfs_name - pr_cont name of a kernfs_node * @kn: kernfs_node of interest * diff --git a/fs/lockd/procfs.h b/fs/lockd/procfs.h index 2257a1311027..184a15edd18d 100644 --- a/fs/lockd/procfs.h +++ b/fs/lockd/procfs.h @@ -6,8 +6,6 @@ #ifndef _LOCKD_PROCFS_H #define _LOCKD_PROCFS_H -#include <linux/kconfig.h> - #if IS_ENABLED(CONFIG_PROC_FS) int lockd_create_procfs(void); void lockd_remove_procfs(void); diff --git a/fs/namei.c b/fs/namei.c index a7f601cd521a..5b4eed221530 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4668,6 +4668,31 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) } EXPORT_SYMBOL(generic_readlink); +/** + * vfs_get_link - get symlink body + * @dentry: dentry on which to get symbolic link + * @done: caller needs to free returned data with this + * + * Calls security hook and i_op->get_link() on the supplied inode. + * + * It does not touch atime. That's up to the caller if necessary. + * + * Does not work on "special" symlinks like /proc/$$/fd/N + */ +const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done) +{ + const char *res = ERR_PTR(-EINVAL); + struct inode *inode = d_inode(dentry); + + if (d_is_symlink(dentry)) { + res = ERR_PTR(security_inode_readlink(dentry)); + if (!res) + res = inode->i_op->get_link(dentry, inode, done); + } + return res; +} +EXPORT_SYMBOL(vfs_get_link); + /* get the link contents into pagecache */ const char *page_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c index 5f7b053720ee..6de15709d024 100644 --- a/fs/nfs/cache_lib.c +++ b/fs/nfs/cache_lib.c @@ -76,7 +76,7 @@ static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany) dreq = container_of(d, struct nfs_cache_defer_req, deferred_req); - complete_all(&dreq->completion); + complete(&dreq->completion); nfs_cache_defer_req_put(dreq); } diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 52a28311e2a4..532d8e242d4d 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -31,8 +31,6 @@ struct nfs_callback_data { unsigned int users; struct svc_serv *serv; - struct svc_rqst *rqst; - struct task_struct *task; }; static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1]; @@ -89,15 +87,6 @@ nfs4_callback_svc(void *vrqstp) return 0; } -/* - * Prepare to bring up the NFSv4 callback service - */ -static struct svc_rqst * -nfs4_callback_up(struct svc_serv *serv) -{ - return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); -} - #if defined(CONFIG_NFS_V4_1) /* * The callback service for NFSv4.1 callbacks @@ -139,29 +128,6 @@ nfs41_callback_svc(void *vrqstp) return 0; } -/* - * Bring up the NFSv4.1 callback service - */ -static struct svc_rqst * -nfs41_callback_up(struct svc_serv *serv) -{ - struct svc_rqst *rqstp; - - INIT_LIST_HEAD(&serv->sv_cb_list); - spin_lock_init(&serv->sv_cb_lock); - init_waitqueue_head(&serv->sv_cb_waitq); - rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); - dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp)); - return rqstp; -} - -static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv, - struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) -{ - *rqstpp = nfs41_callback_up(serv); - 
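
The new vfs_get_link() helper pairs with the delayed_call mechanism: i_op->get_link() may hand back memory whose lifetime is tied to @done, so the caller releases it with do_delayed_call() when finished with the string. A minimal, illustrative caller (assumed normal kernel context; not part of this patch):

#include <linux/fs.h>
#include <linux/delayed_call.h>
#include <linux/printk.h>

static int print_link_target(struct dentry *dentry)
{
        DEFINE_DELAYED_CALL(done);
        const char *body = vfs_get_link(dentry, &done);

        if (IS_ERR(body))
                return PTR_ERR(body);
        pr_info("symlink body: %s\n", body);
        do_delayed_call(&done); /* releases whatever get_link() set up */
        return 0;
}
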
*callback_svc = nfs41_callback_svc; -} - static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, struct svc_serv *serv) { @@ -173,13 +139,6 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, xprt->bc_serv = serv; } #else -static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv, - struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) -{ - *rqstpp = ERR_PTR(-ENOTSUPP); - *callback_svc = ERR_PTR(-ENOTSUPP); -} - static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, struct svc_serv *serv) { @@ -189,45 +148,22 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, struct svc_serv *serv) { - struct svc_rqst *rqstp; - int (*callback_svc)(void *vrqstp); - struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + int nrservs = nfs_callback_nr_threads; int ret; nfs_callback_bc_serv(minorversion, xprt, serv); - if (cb_info->task) - return 0; + if (nrservs < NFS4_MIN_NR_CALLBACK_THREADS) + nrservs = NFS4_MIN_NR_CALLBACK_THREADS; - switch (minorversion) { - case 0: - /* v4.0 callback setup */ - rqstp = nfs4_callback_up(serv); - callback_svc = nfs4_callback_svc; - break; - default: - nfs_minorversion_callback_svc_setup(serv, - &rqstp, &callback_svc); - } - - if (IS_ERR(rqstp)) - return PTR_ERR(rqstp); - - svc_sock_update_bufs(serv); + if (serv->sv_nrthreads-1 == nrservs) + return 0; - cb_info->serv = serv; - cb_info->rqst = rqstp; - cb_info->task = kthread_create(callback_svc, cb_info->rqst, - "nfsv4.%u-svc", minorversion); - if (IS_ERR(cb_info->task)) { - ret = PTR_ERR(cb_info->task); - svc_exit_thread(cb_info->rqst); - cb_info->rqst = NULL; - cb_info->task = NULL; + ret = serv->sv_ops->svo_setup(serv, NULL, nrservs); + if (ret) { + serv->sv_ops->svo_setup(serv, NULL, 0); return ret; } - rqstp->rq_task = cb_info->task; - wake_up_process(cb_info->task); dprintk("nfs_callback_up: service started\n"); return 0; } @@ -281,19 +217,41 @@ err_bind: return ret; } -static struct svc_serv_ops nfs_cb_sv_ops = { +static struct svc_serv_ops nfs40_cb_sv_ops = { + .svo_function = nfs4_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, + .svo_setup = svc_set_num_threads, + .svo_module = THIS_MODULE, +}; +#if defined(CONFIG_NFS_V4_1) +static struct svc_serv_ops nfs41_cb_sv_ops = { + .svo_function = nfs41_callback_svc, + .svo_enqueue_xprt = svc_xprt_do_enqueue, + .svo_setup = svc_set_num_threads, + .svo_module = THIS_MODULE, +}; + +struct svc_serv_ops *nfs4_cb_sv_ops[] = { + [0] = &nfs40_cb_sv_ops, + [1] = &nfs41_cb_sv_ops, +}; +#else +struct svc_serv_ops *nfs4_cb_sv_ops[] = { + [0] = &nfs40_cb_sv_ops, + [1] = NULL, }; +#endif static struct svc_serv *nfs_callback_create_svc(int minorversion) { struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; struct svc_serv *serv; + struct svc_serv_ops *sv_ops; /* * Check whether we're already up and running. */ - if (cb_info->task) { + if (cb_info->serv) { /* * Note: increase service usage, because later in case of error * svc_destroy() will be called. @@ -302,6 +260,17 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) return cb_info->serv; } + switch (minorversion) { + case 0: + sv_ops = nfs4_cb_sv_ops[0]; + break; + default: + sv_ops = nfs4_cb_sv_ops[1]; + } + + if (sv_ops == NULL) + return ERR_PTR(-ENOTSUPP); + /* * Sanity check: if there's no task, * we should be the first user ... 
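
After this rewrite the callback service no longer tracks its own kthread and svc_rqst; thread lifetime is delegated to the svc_serv pool through svo_setup (svc_set_num_threads), and shutdown is simply a request for zero threads. The pattern used by nfs_callback_start_svc() and nfs_callback_down(), in isolation:

#include <linux/sunrpc/svc.h>

/* Resize the callback thread pool; on failure, tear all threads
 * back down rather than leave a partially started service. */
static int resize_callback_threads(struct svc_serv *serv, int want)
{
        int ret = serv->sv_ops->svo_setup(serv, NULL, want);

        if (ret)
                serv->sv_ops->svo_setup(serv, NULL, 0);
        return ret;
}
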
@@ -310,11 +279,12 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n", cb_info->users); - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, &nfs_cb_sv_ops); + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops); if (!serv) { printk(KERN_ERR "nfs_callback_create_svc: create service failed\n"); return ERR_PTR(-ENOMEM); } + cb_info->serv = serv; /* As there is only one thread we need to over-ride the * default maximum of 80 connections */ @@ -357,6 +327,8 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) * thread exits. */ err_net: + if (!cb_info->users) + cb_info->serv = NULL; svc_destroy(serv); err_create: mutex_unlock(&nfs_callback_mutex); @@ -374,18 +346,18 @@ err_start: void nfs_callback_down(int minorversion, struct net *net) { struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + struct svc_serv *serv; mutex_lock(&nfs_callback_mutex); - nfs_callback_down_net(minorversion, cb_info->serv, net); + serv = cb_info->serv; + nfs_callback_down_net(minorversion, serv, net); cb_info->users--; - if (cb_info->users == 0 && cb_info->task != NULL) { - kthread_stop(cb_info->task); - dprintk("nfs_callback_down: service stopped\n"); - svc_exit_thread(cb_info->rqst); + if (cb_info->users == 0) { + svc_get(serv); + serv->sv_ops->svo_setup(serv, NULL, 0); + svc_destroy(serv); dprintk("nfs_callback_down: service destroyed\n"); cb_info->serv = NULL; - cb_info->rqst = NULL; - cb_info->task = NULL; } mutex_unlock(&nfs_callback_mutex); } diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 5fe1cecbf9f0..c701c308fac5 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -179,6 +179,15 @@ extern __be32 nfs4_callback_devicenotify( struct cb_devicenotifyargs *args, void *dummy, struct cb_process_state *cps); +struct cb_notify_lock_args { + struct nfs_fh cbnl_fh; + struct nfs_lowner cbnl_owner; + bool cbnl_valid; +}; + +extern __be32 nfs4_callback_notify_lock(struct cb_notify_lock_args *args, + void *dummy, + struct cb_process_state *cps); #endif /* CONFIG_NFS_V4_1 */ extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *); extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, @@ -198,6 +207,9 @@ extern void nfs_callback_down(int minorversion, struct net *net); #define NFS41_BC_MIN_CALLBACKS 1 #define NFS41_BC_MAX_CALLBACKS 1 +#define NFS4_MIN_NR_CALLBACK_THREADS 1 + extern unsigned int nfs_callback_set_tcpport; +extern unsigned short nfs_callback_nr_threads; #endif /* __LINUX_FS_NFS_CALLBACK_H */ diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index f953ef6b2f2e..e9aa235e9d10 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -628,4 +628,20 @@ out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; } + +__be32 nfs4_callback_notify_lock(struct cb_notify_lock_args *args, void *dummy, + struct cb_process_state *cps) +{ + if (!cps->clp) /* set in cb_sequence */ + return htonl(NFS4ERR_OP_NOT_IN_SESSION); + + dprintk_rcu("NFS: CB_NOTIFY_LOCK request from %s\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + /* Don't wake anybody if the string looked bogus */ + if (args->cbnl_valid) + __wake_up(&cps->clp->cl_lock_waitq, TASK_NORMAL, 0, args); + + return htonl(NFS4_OK); +} #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 656f68f7fe53..eb094c6011d8 100644 --- a/fs/nfs/callback_xdr.c +++ 
b/fs/nfs/callback_xdr.c @@ -35,6 +35,7 @@ (1 + 3) * 4) // seqid, 3 slotids #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_NOTIFY_LOCK_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #endif /* CONFIG_NFS_V4_1 */ #define NFSDBG_FACILITY NFSDBG_CALLBACK @@ -72,7 +73,7 @@ static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) return xdr_ressize_check(rqstp, p); } -static __be32 *read_buf(struct xdr_stream *xdr, int nbytes) +static __be32 *read_buf(struct xdr_stream *xdr, size_t nbytes) { __be32 *p; @@ -534,6 +535,49 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp, return 0; } +static __be32 decode_lockowner(struct xdr_stream *xdr, struct cb_notify_lock_args *args) +{ + __be32 *p; + unsigned int len; + + p = read_buf(xdr, 12); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + + p = xdr_decode_hyper(p, &args->cbnl_owner.clientid); + len = be32_to_cpu(*p); + + p = read_buf(xdr, len); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + + /* Only try to decode if the length is right */ + if (len == 20) { + p += 2; /* skip "lock id:" */ + args->cbnl_owner.s_dev = be32_to_cpu(*p++); + xdr_decode_hyper(p, &args->cbnl_owner.id); + args->cbnl_valid = true; + } else { + args->cbnl_owner.s_dev = 0; + args->cbnl_owner.id = 0; + args->cbnl_valid = false; + } + return 0; +} + +static __be32 decode_notify_lock_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_notify_lock_args *args) +{ + __be32 status; + + status = decode_fh(xdr, &args->cbnl_fh); + if (unlikely(status != 0)) + goto out; + status = decode_lockowner(xdr, args); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + #endif /* CONFIG_NFS_V4_1 */ static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) @@ -746,6 +790,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) case OP_CB_RECALL_SLOT: case OP_CB_LAYOUTRECALL: case OP_CB_NOTIFY_DEVICEID: + case OP_CB_NOTIFY_LOCK: *op = &callback_ops[op_nr]; break; @@ -753,7 +798,6 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) case OP_CB_PUSH_DELEG: case OP_CB_RECALLABLE_OBJ_AVAIL: case OP_CB_WANTS_CANCELLED: - case OP_CB_NOTIFY_LOCK: return htonl(NFS4ERR_NOTSUPP); default: @@ -1006,6 +1050,11 @@ static struct callback_op callback_ops[] = { .decode_args = (callback_decode_arg_t)decode_recallslot_args, .res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ, }, + [OP_CB_NOTIFY_LOCK] = { + .process_op = (callback_process_op_t)nfs4_callback_notify_lock, + .decode_args = (callback_decode_arg_t)decode_notify_lock_args, + .res_maxsize = CB_OP_NOTIFY_LOCK_RES_MAXSZ, + }, #endif /* CONFIG_NFS_V4_1 */ };
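Two details in decode_lockowner() deserve a note. First, read_buf() now takes a size_t, so a huge client-supplied length is not funnelled through a signed int. Second, the magic length 20 is the client recognising its own forward-channel lock-owner encoding coming back at it: encode_lockowner() in fs/nfs/nfs4xdr.c emits an 8-byte "lock id:" tag, a 4-byte superblock device number and an 8-byte owner id, so an owner of any other length cannot be ours and is dropped by setting cbnl_valid = false rather than treated as an error. The 20-byte layout, sketched for reference (this struct is not in the patch):

    struct {                    /* opaque lock owner, the client's own format */
        char    tag[8];         /* the literal bytes "lock id:" */
        __be32  s_dev;          /* superblock device number */
        __be64  id;             /* owner id from the seqid counter */
    };                          /* 8 + 4 + 8 == 20, hence the length check */

diff --git a/fs/nfs/client.c index 1e106780a237..7555ba889d1f 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -313,7 +313,10 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat continue; /* Match the full socket address */ if (!rpc_cmp_addr_port(sap, clap)) - continue; + /* Match all xprt_switch full socket addresses */ + if (!rpc_clnt_xprt_switch_has_addr(clp->cl_rpcclient, + sap)) + continue; atomic_inc(&clp->cl_count); return clp; @@ -785,7 +788,8 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs } fsinfo.fattr = fattr; - fsinfo.layouttype = 0; + fsinfo.nlayouttypes = 0; + memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype)); error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);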
if (error < 0) goto out_error; @@ -1078,7 +1082,7 @@ void nfs_clients_init(struct net *net) idr_init(&nn->cb_ident_idr); #endif spin_lock_init(&nn->nfs_client_lock); - nn->boot_time = CURRENT_TIME; + nn->boot_time = ktime_get_real(); } #ifdef CONFIG_PROC_FS diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 322c2585bc34..dff600ae0d74 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -41,6 +41,17 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); } +static bool +nfs4_is_valid_delegation(const struct nfs_delegation *delegation, + fmode_t flags) +{ + if (delegation != NULL && (delegation->type & flags) == flags && + !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) && + !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + return true; + return false; +} + static int nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark) { @@ -50,8 +61,7 @@ nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark) flags &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation != NULL && (delegation->type & flags) == flags && - !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + if (nfs4_is_valid_delegation(delegation, flags)) { if (mark) nfs_mark_delegation_referenced(delegation); ret = 1; @@ -185,15 +195,13 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, rcu_read_unlock(); put_rpccred(oldcred); trace_nfs4_reclaim_delegation(inode, res->delegation_type); - } else { - /* We appear to have raced with a delegation return. */ - spin_unlock(&delegation->lock); - rcu_read_unlock(); - nfs_inode_set_delegation(inode, cred, res); + return; } - } else { - rcu_read_unlock(); + /* We appear to have raced with a delegation return. 
*/ + spin_unlock(&delegation->lock); } + rcu_read_unlock(); + nfs_inode_set_delegation(inode, cred, res); } static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) @@ -642,28 +650,49 @@ static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *cl rcu_read_unlock(); } -static void nfs_revoke_delegation(struct inode *inode) +static void nfs_mark_delegation_revoked(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_REVOKED, &delegation->flags); + delegation->stateid.type = NFS4_INVALID_STATEID_TYPE; + nfs_mark_return_delegation(server, delegation); +} + +static bool nfs_revoke_delegation(struct inode *inode, + const nfs4_stateid *stateid) { struct nfs_delegation *delegation; + nfs4_stateid tmp; + bool ret = false; + rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation != NULL) { - set_bit(NFS_DELEGATION_REVOKED, &delegation->flags); - nfs_mark_return_delegation(NFS_SERVER(inode), delegation); - } + if (delegation == NULL) + goto out; + if (stateid == NULL) { + nfs4_stateid_copy(&tmp, &delegation->stateid); + stateid = &tmp; + } else if (!nfs4_stateid_match(stateid, &delegation->stateid)) + goto out; + nfs_mark_delegation_revoked(NFS_SERVER(inode), delegation); + ret = true; +out: rcu_read_unlock(); + if (ret) + nfs_inode_find_state_and_recover(inode, stateid); + return ret; } -void nfs_remove_bad_delegation(struct inode *inode) +void nfs_remove_bad_delegation(struct inode *inode, + const nfs4_stateid *stateid) { struct nfs_delegation *delegation; - nfs_revoke_delegation(inode); + if (!nfs_revoke_delegation(inode, stateid)) + return; delegation = nfs_inode_detach_delegation(inode); - if (delegation) { - nfs_inode_find_state_and_recover(inode, &delegation->stateid); + if (delegation) nfs_free_delegation(delegation); - } } EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); @@ -786,8 +815,15 @@ static void nfs_delegation_mark_reclaim_server(struct nfs_server *server) { struct nfs_delegation *delegation; - list_for_each_entry_rcu(delegation, &server->delegations, super_list) + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + /* + * If the delegation may have been admin revoked, then we + * cannot reclaim it. 
+ */ + if (test_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags)) + continue; set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); + } } /** @@ -851,6 +887,141 @@ restart: rcu_read_unlock(); } +static inline bool nfs4_server_rebooted(const struct nfs_client *clp) +{ + return (clp->cl_state & (BIT(NFS4CLNT_CHECK_LEASE) | + BIT(NFS4CLNT_LEASE_EXPIRED) | + BIT(NFS4CLNT_SESSION_RESET))) != 0; +} + +static void nfs_mark_test_expired_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + if (delegation->stateid.type == NFS4_INVALID_STATEID_TYPE) + return; + clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); + set_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags); + set_bit(NFS4CLNT_DELEGATION_EXPIRED, &server->nfs_client->cl_state); +} + +static void nfs_inode_mark_test_expired_delegation(struct nfs_server *server, + struct inode *inode) +{ + struct nfs_delegation *delegation; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation) + nfs_mark_test_expired_delegation(server, delegation); + rcu_read_unlock(); + +} + +static void nfs_delegation_mark_test_expired_server(struct nfs_server *server) +{ + struct nfs_delegation *delegation; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) + nfs_mark_test_expired_delegation(server, delegation); +} + +/** + * nfs_mark_test_expired_all_delegations - mark all delegations for testing + * @clp: nfs_client to process + * + * Iterates through all the delegations associated with this server and + * marks them as needing to be checked for validity. + */ +void nfs_mark_test_expired_all_delegations(struct nfs_client *clp) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_delegation_mark_test_expired_server(server); + rcu_read_unlock(); +}
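The marking and reaping halves are glued together by NFS4CLNT_DELEGATION_EXPIRED: every delegation that gets NFS_DELEGATION_TEST_EXPIRED also raises that client-wide bit, and the state manager is what eventually drains the work. A rough sketch of the consumer, assuming the usual nfs4state.c event loop (the actual hook-up lives outside these hunks):

    /* flag-handler side, e.g. after a CB_SEQUENCE status flag reports
     * revoked recallable state */
    nfs_mark_test_expired_all_delegations(clp); /* raises the clp bit */
    nfs4_schedule_state_manager(clp);

    /* inside the nfs4_state_manager() loop */
    if (test_and_clear_bit(NFS4CLNT_DELEGATION_EXPIRED, &clp->cl_state)) {
        nfs_reap_expired_delegations(clp); /* TEST_STATEID each marked
                                            * delegation, then revoke and
                                            * recover what the server no
                                            * longer recognises */
        continue;
    }

+ +/** + * nfs_reap_expired_delegations - reap expired delegations + * @clp: nfs_client to process + * + * Iterates through all the delegations associated with this server and + * checks if they may have been revoked. This function is usually + * expected to be called in cases where the server may have lost its + * lease.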
+ */ +void nfs_reap_expired_delegations(struct nfs_client *clp) +{ + const struct nfs4_minor_version_ops *ops = clp->cl_mvops; + struct nfs_delegation *delegation; + struct nfs_server *server; + struct inode *inode; + struct rpc_cred *cred; + nfs4_stateid stateid; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry_rcu(delegation, &server->delegations, + super_list) { + if (test_bit(NFS_DELEGATION_RETURNING, + &delegation->flags)) + continue; + if (test_bit(NFS_DELEGATION_TEST_EXPIRED, + &delegation->flags) == 0) + continue; + if (!nfs_sb_active(server->super)) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) { + rcu_read_unlock(); + nfs_sb_deactive(server->super); + goto restart; + } + cred = get_rpccred_rcu(delegation->cred); + nfs4_stateid_copy(&stateid, &delegation->stateid); + clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags); + rcu_read_unlock(); + if (cred != NULL && + ops->test_and_free_expired(server, &stateid, cred) < 0) { + nfs_revoke_delegation(inode, &stateid); + nfs_inode_find_state_and_recover(inode, &stateid); + } + put_rpccred(cred); + if (nfs4_server_rebooted(clp)) { + nfs_inode_mark_test_expired_delegation(server,inode); + iput(inode); + nfs_sb_deactive(server->super); + return; + } + iput(inode); + nfs_sb_deactive(server->super); + goto restart; + } + } + rcu_read_unlock(); +} + +void nfs_inode_find_delegation_state_and_recover(struct inode *inode, + const nfs4_stateid *stateid) +{ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_delegation *delegation; + bool found = false; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation && + nfs4_stateid_match_other(&delegation->stateid, stateid)) { + nfs_mark_test_expired_delegation(NFS_SERVER(inode), delegation); + found = true; + } + rcu_read_unlock(); + if (found) + nfs4_schedule_state_manager(clp); +} + /** * nfs_delegations_present - check for existence of delegations * @clp: client state handle @@ -893,7 +1064,7 @@ bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, flags &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); - ret = (delegation != NULL && (delegation->type & flags) == flags); + ret = nfs4_is_valid_delegation(delegation, flags); if (ret) { nfs4_stateid_copy(dst, &delegation->stateid); nfs_mark_delegation_referenced(delegation); diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 64724d252a79..e9d555796873 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -32,6 +32,7 @@ enum { NFS_DELEGATION_REFERENCED, NFS_DELEGATION_RETURNING, NFS_DELEGATION_REVOKED, + NFS_DELEGATION_TEST_EXPIRED, }; int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); @@ -47,11 +48,14 @@ void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags); void nfs_expire_unreferenced_delegations(struct nfs_client *clp); int nfs_client_return_marked_delegations(struct nfs_client *clp); int nfs_delegations_present(struct nfs_client *clp); -void nfs_remove_bad_delegation(struct inode *inode); +void nfs_remove_bad_delegation(struct inode *inode, const nfs4_stateid *stateid); void nfs_delegation_mark_reclaim(struct nfs_client *clp); void nfs_delegation_reap_unclaimed(struct nfs_client *clp); +void nfs_mark_test_expired_all_delegations(struct nfs_client *clp); +void nfs_reap_expired_delegations(struct nfs_client *clp); + /* NFSv4 
delegation-related procedures */ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync); int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type); @@ -62,6 +66,8 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); int nfs4_have_delegation(struct inode *inode, fmode_t flags); int nfs4_check_delegation(struct inode *inode, fmode_t flags); bool nfs4_delegation_flush_on_close(const struct inode *inode); +void nfs_inode_find_delegation_state_and_recover(struct inode *inode, + const nfs4_stateid *stateid); #endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 06e0bf092ba9..5f1af4cd1a33 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -435,11 +435,11 @@ int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) return 0; nfsi = NFS_I(inode); - if (entry->fattr->fileid == nfsi->fileid) - return 1; - if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0) - return 1; - return 0; + if (entry->fattr->fileid != nfsi->fileid) + return 0; + if (entry->fh->size && nfs_compare_fh(entry->fh, &nfsi->fh) != 0) + return 0; + return 1; } static @@ -496,6 +496,14 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) return; if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID)) return; + if (filename.len == 0) + return; + /* Validate that the name doesn't contain any illegal '\0' */ + if (strnlen(filename.name, filename.len) != filename.len) + return; + /* ...or '/' */ + if (strnchr(filename.name, filename.len, '/')) + return; if (filename.name[0] == '.') { if (filename.len == 1) return; @@ -517,6 +525,8 @@ again: &entry->fattr->fsid)) goto out; if (nfs_same_file(dentry, entry)) { + if (!entry->fh->size) + goto out; nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); status = nfs_refresh_inode(d_inode(dentry), entry->fattr); if (!status) @@ -529,6 +539,10 @@ again: goto again; } } + if (!entry->fh->size) { + d_lookup_done(dentry); + goto out; + } inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label); alias = d_splice_alias(inode, dentry); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 72b7d13ee3c6..bd81bcf3ffcf 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -387,7 +387,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq) dreq->iocb->ki_complete(dreq->iocb, res, 0); } - complete_all(&dreq->completion); + complete(&dreq->completion); nfs_direct_req_release(dreq); } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 2efbdde36c3e..9ea85ae23c32 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -520,7 +520,9 @@ const struct address_space_operations nfs_file_aops = { .invalidatepage = nfs_invalidate_page, .releasepage = nfs_release_page, .direct_IO = nfs_direct_IO, +#ifdef CONFIG_MIGRATION .migratepage = nfs_migrate_page, +#endif .launder_page = nfs_launder_page, .is_dirty_writeback = nfs_check_dirty_writeback, .error_remove_page = generic_error_remove_page, @@ -685,11 +687,6 @@ out_noconflict: goto out; } -static int do_vfs_lock(struct file *file, struct file_lock *fl) -{ - return locks_lock_file_wait(file, fl); -} - static int do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { @@ -722,7 +719,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) if (!is_local) status = NFS_PROTO(inode)->lock(filp, cmd, fl); else - status = do_vfs_lock(filp, fl); + status = locks_lock_file_wait(filp, fl); return status; } @@ -747,7 +744,7 @@ do_setlk(struct file *filp, int cmd, 
struct file_lock *fl, int is_local) if (!is_local) status = NFS_PROTO(inode)->lock(filp, cmd, fl); else - status = do_vfs_lock(filp, fl); + status = locks_lock_file_wait(filp, fl); if (status < 0) goto out; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 51b51369704c..98ace127bf86 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1080,7 +1080,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, case -NFS4ERR_BAD_STATEID: if (state == NULL) break; - nfs_remove_bad_delegation(state->inode); + nfs_remove_bad_delegation(state->inode, NULL); case -NFS4ERR_OPENMODE: if (state == NULL) break; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index a6acce663219..80bcc0befb07 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -534,12 +534,9 @@ void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) } #endif - #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); -#else -#define nfs_migrate_page NULL #endif static inline int @@ -562,7 +559,6 @@ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); /* nfs4proc.c */ -extern void __nfs4_read_done_cb(struct nfs_pgio_header *); extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, const struct nfs_client_initdata *); extern int nfs40_walk_client_list(struct nfs_client *clp, @@ -571,6 +567,9 @@ extern int nfs40_walk_client_list(struct nfs_client *clp, extern int nfs41_walk_client_list(struct nfs_client *clp, struct nfs_client **result, struct rpc_cred *cred); +extern int nfs4_test_session_trunk(struct rpc_clnt *, + struct rpc_xprt *, + void *); static inline struct inode *nfs_igrab_and_active(struct inode *inode) { diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index f0e06e4acbef..fbce0d885d4c 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -29,7 +29,7 @@ struct nfs_net { int cb_users[NFS4_MAX_MINOR_VERSION + 1]; #endif spinlock_t nfs_client_lock; - struct timespec boot_time; + ktime_t boot_time; #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_nfsfs; #endif diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 64b43b4ad9dd..608501971fe0 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -443,6 +443,7 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server, task = rpc_run_task(&task_setup); if (IS_ERR(task)) return PTR_ERR(task); + rpc_put_task(task); return 0; }
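The single added line in nfs42proc.c above fixes a task leak, and it illustrates the rpc task ownership rule that the exchange_id rework later in this patch relies on as well: rpc_run_task() returns a counted reference even for RPC_TASK_ASYNC fire-and-forget calls, and the rpc layer holds its own reference for the lifetime of the call, so a caller that never waits must still drop its copy:

    task = rpc_run_task(&task_setup);
    if (IS_ERR(task))
        return PTR_ERR(task);
    rpc_put_task(task); /* drop the caller's ref; the call keeps running and
                         * the ops' .rpc_release still fires on completion */
    return 0;

diff --git a/fs/nfs/nfs4_fs.h index 9bf64eacba5b..9b3a82abab07 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -39,6 +39,7 @@ enum nfs4_client_state { NFS4CLNT_BIND_CONN_TO_SESSION, NFS4CLNT_MOVED, NFS4CLNT_LEASE_MOVED, + NFS4CLNT_DELEGATION_EXPIRED, }; #define NFS4_RENEW_TIMEOUT 0x01 @@ -57,8 +58,11 @@ struct nfs4_minor_version_ops { struct nfs_fsinfo *); void (*free_lock_state)(struct nfs_server *, struct nfs4_lock_state *); + int (*test_and_free_expired)(struct nfs_server *, + nfs4_stateid *, struct rpc_cred *); struct nfs_seqid * (*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); + int (*session_trunk)(struct rpc_clnt *, struct rpc_xprt *, void *); const struct rpc_call_ops *call_sync_ops; const struct nfs4_state_recovery_ops *reboot_recovery_ops; const struct nfs4_state_recovery_ops *nograce_recovery_ops; @@ -156,6 +160,7 @@ enum { NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */ NFS_STATE_POSIX_LOCKS, /* Posix locks are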
supported */ NFS_STATE_RECOVERY_FAILED, /* OPEN stateid state recovery failed */ + NFS_STATE_MAY_NOTIFY_LOCK, /* server may CB_NOTIFY_LOCK */ }; struct nfs4_state { @@ -203,6 +208,11 @@ struct nfs4_state_recovery_ops { struct rpc_cred *); }; +struct nfs4_add_xprt_data { + struct nfs_client *clp; + struct rpc_cred *cred; +}; + struct nfs4_state_maintenance_ops { int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *, unsigned); struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *); @@ -278,6 +288,8 @@ extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync); +extern int nfs4_detect_session_trunking(struct nfs_client *clp, + struct nfs41_exchange_id_res *res, struct rpc_xprt *xprt); static inline bool is_ds_only_client(struct nfs_client *clp) @@ -439,7 +451,7 @@ extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); extern int nfs4_schedule_migration_recovery(const struct nfs_server *); extern void nfs4_schedule_lease_moved_recovery(struct nfs_client *); -extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); +extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags, bool); extern void nfs41_handle_server_scope(struct nfs_client *, struct nfs41_server_scope **); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); @@ -471,6 +483,7 @@ extern struct nfs_subversion nfs_v4; struct dentry *nfs4_try_mount(int, const char *, struct nfs_mount_info *, struct nfs_subversion *); extern bool nfs4_disable_idmapping; extern unsigned short max_session_slots; +extern unsigned short max_session_cb_slots; extern unsigned short send_implementation_id; extern bool recover_lost_locks; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index cd3b7cfdde16..074ac7131459 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -199,6 +199,9 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_minorversion = cl_init->minorversion; clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; clp->cl_mig_gen = 1; +#if IS_ENABLED(CONFIG_NFS_V4_1) + init_waitqueue_head(&clp->cl_lock_waitq); +#endif return clp; error: @@ -562,15 +565,15 @@ out: /* * Returns true if the client IDs match */ -static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b) +static bool nfs4_match_clientids(u64 a, u64 b) { - if (a->cl_clientid != b->cl_clientid) { + if (a != b) { dprintk("NFS: --> %s client ID %llx does not match %llx\n", - __func__, a->cl_clientid, b->cl_clientid); + __func__, a, b); return false; } dprintk("NFS: --> %s client ID %llx matches %llx\n", - __func__, a->cl_clientid, b->cl_clientid); + __func__, a, b); return true; } @@ -578,17 +581,15 @@ static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b) * Returns true if the server major ids match */ static bool -nfs4_check_clientid_trunking(struct nfs_client *a, struct nfs_client *b) +nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1, + struct nfs41_server_owner *o2) { - struct nfs41_server_owner *o1 = a->cl_serverowner; - struct nfs41_server_owner *o2 = b->cl_serverowner; - if (o1->major_id_sz != o2->major_id_sz) goto out_major_mismatch; if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0) goto out_major_mismatch; - dprintk("NFS: --> %s server owners match\n", __func__); + 
dprintk("NFS: --> %s server owner major IDs match\n", __func__); return true; out_major_mismatch: @@ -597,6 +598,100 @@ out_major_mismatch: return false; } +/* + * Returns true if server minor ids match + */ +static bool +nfs4_check_serverowner_minor_id(struct nfs41_server_owner *o1, + struct nfs41_server_owner *o2) +{ + /* Check eir_server_owner so_minor_id */ + if (o1->minor_id != o2->minor_id) + goto out_minor_mismatch; + + dprintk("NFS: --> %s server owner minor IDs match\n", __func__); + return true; + +out_minor_mismatch: + dprintk("NFS: --> %s server owner minor IDs do not match\n", __func__); + return false; +} + +/* + * Returns true if the server scopes match + */ +static bool +nfs4_check_server_scope(struct nfs41_server_scope *s1, + struct nfs41_server_scope *s2) +{ + if (s1->server_scope_sz != s2->server_scope_sz) + goto out_scope_mismatch; + if (memcmp(s1->server_scope, s2->server_scope, + s1->server_scope_sz) != 0) + goto out_scope_mismatch; + + dprintk("NFS: --> %s server scopes match\n", __func__); + return true; + +out_scope_mismatch: + dprintk("NFS: --> %s server scopes do not match\n", + __func__); + return false; +} + +/** + * nfs4_detect_session_trunking - Checks for session trunking. + * + * Called after a successful EXCHANGE_ID on a multi-addr connection. + * Upon success, add the transport. + * + * @clp: original mount nfs_client + * @res: result structure from an exchange_id using the original mount + * nfs_client with a new multi_addr transport + * + * Returns zero on success, otherwise -EINVAL + * + * Note: since the exchange_id for the new multi_addr transport uses the + * same nfs_client from the original mount, the cl_owner_id is reused, + * so eir_clientowner is the same. + */ +int nfs4_detect_session_trunking(struct nfs_client *clp, + struct nfs41_exchange_id_res *res, + struct rpc_xprt *xprt) +{ + /* Check eir_clientid */ + if (!nfs4_match_clientids(clp->cl_clientid, res->clientid)) + goto out_err; + + /* Check eir_server_owner so_major_id */ + if (!nfs4_check_serverowner_major_id(clp->cl_serverowner, + res->server_owner)) + goto out_err; + + /* Check eir_server_owner so_minor_id */ + if (!nfs4_check_serverowner_minor_id(clp->cl_serverowner, + res->server_owner)) + goto out_err; + + /* Check eir_server_scope */ + if (!nfs4_check_server_scope(clp->cl_serverscope, res->server_scope)) + goto out_err; + + /* Session trunking passed, add the xprt */ + rpc_clnt_xprt_switch_add_xprt(clp->cl_rpcclient, xprt); + + pr_info("NFS: %s: Session trunking succeeded for %s\n", + clp->cl_hostname, + xprt->address_strings[RPC_DISPLAY_ADDR]); + + return 0; +out_err: + pr_info("NFS: %s: Session trunking failed for %s\n", clp->cl_hostname, + xprt->address_strings[RPC_DISPLAY_ADDR]); + + return -EINVAL; +} + /** * nfs41_walk_client_list - Find nfs_client that matches a client/server owner * @@ -650,7 +745,7 @@ int nfs41_walk_client_list(struct nfs_client *new, if (pos->cl_cons_state != NFS_CS_READY) continue; - if (!nfs4_match_clientids(pos, new)) + if (!nfs4_match_clientids(pos->cl_clientid, new->cl_clientid)) continue; /* @@ -658,7 +753,8 @@ int nfs41_walk_client_list(struct nfs_client *new, * client id trunking. In either case, we want to fall back * to using the existing nfs_client. 
*/ - if (!nfs4_check_clientid_trunking(pos, new)) + if (!nfs4_check_serverowner_major_id(pos->cl_serverowner, + new->cl_serverowner)) continue; /* Unlike NFSv4.0, we know that NFSv4.1 always uses the diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0e327528a3ce..ad917bd72b38 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -99,8 +99,8 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, #ifdef CONFIG_NFS_V4_1 static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, struct rpc_cred *); -static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *, - struct rpc_cred *); +static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *, + struct rpc_cred *, bool); #endif #ifdef CONFIG_NFS_V4_SECURITY_LABEL @@ -328,6 +328,33 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent kunmap_atomic(start); } +static void nfs4_test_and_free_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + const struct nfs4_minor_version_ops *ops = server->nfs_client->cl_mvops; + + ops->test_and_free_expired(server, stateid, cred); +} + +static void __nfs4_free_revoked_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + stateid->type = NFS4_REVOKED_STATEID_TYPE; + nfs4_test_and_free_stateid(server, stateid, cred); +} + +static void nfs4_free_revoked_stateid(struct nfs_server *server, + const nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + nfs4_stateid tmp; + + nfs4_stateid_copy(&tmp, stateid); + __nfs4_free_revoked_stateid(server, &tmp, cred); +} + static long nfs4_update_delay(long *timeout) { long ret; @@ -370,13 +397,23 @@ static int nfs4_do_handle_exception(struct nfs_server *server, exception->delay = 0; exception->recovering = 0; exception->retry = 0; + + if (stateid == NULL && state != NULL) + stateid = &state->stateid; + switch(errorcode) { case 0: return 0; - case -NFS4ERR_OPENMODE: case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: + if (inode != NULL && stateid != NULL) { + nfs_inode_find_state_and_recover(inode, + stateid); + goto wait_on_recovery; + } + case -NFS4ERR_OPENMODE: if (inode) { int err; @@ -395,12 +432,6 @@ static int nfs4_do_handle_exception(struct nfs_server *server, if (ret < 0) break; goto wait_on_recovery; - case -NFS4ERR_EXPIRED: - if (state != NULL) { - ret = nfs4_schedule_stateid_recovery(server, state); - if (ret < 0) - break; - } case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: nfs4_schedule_lease_recovery(clp); @@ -616,6 +647,7 @@ int nfs40_setup_sequence(struct nfs4_slot_table *tbl, } spin_unlock(&tbl->slot_tbl_lock); + slot->privileged = args->sa_privileged ? 
1 : 0; args->sa_slot = slot; res->sr_slot = slot; @@ -723,12 +755,20 @@ static int nfs41_sequence_process(struct rpc_task *task, /* Check the SEQUENCE operation status */ switch (res->sr_status) { case 0: + /* If previous op on slot was interrupted and we reused + * the seq# and got a reply from the cache, then retry + */ + if (task->tk_status == -EREMOTEIO && interrupted) { + ++slot->seq_nr; + goto retry_nowait; + } /* Update the slot's sequence and clientid lease timer */ slot->seq_done = 1; clp = session->clp; do_renew_lease(clp, res->sr_timestamp); /* Check sequence flags */ - nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags, + !!slot->privileged); nfs41_update_target_slotid(slot->table, slot, res); break; case 1: @@ -875,6 +915,7 @@ int nfs41_setup_sequence(struct nfs4_session *session, } spin_unlock(&tbl->slot_tbl_lock); + slot->privileged = args->sa_privileged ? 1 : 0; args->sa_slot = slot; dprintk("<-- %s slotid=%u seqid=%u\n", __func__, @@ -1353,6 +1394,19 @@ static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode) nfs4_state_set_mode_locked(state, state->state | fmode); } +#ifdef CONFIG_NFS_V4_1 +static bool nfs_open_stateid_recover_openmode(struct nfs4_state *state) +{ + if (state->n_rdonly && !test_bit(NFS_O_RDONLY_STATE, &state->flags)) + return true; + if (state->n_wronly && !test_bit(NFS_O_WRONLY_STATE, &state->flags)) + return true; + if (state->n_rdwr && !test_bit(NFS_O_RDWR_STATE, &state->flags)) + return true; + return false; +} +#endif /* CONFIG_NFS_V4_1 */ + static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) { struct nfs_client *clp = state->owner->so_server->nfs_client; @@ -1369,11 +1423,12 @@ static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) } static bool nfs_need_update_open_stateid(struct nfs4_state *state, - nfs4_stateid *stateid) + const nfs4_stateid *stateid, nfs4_stateid *freeme) { if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0) return true; if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) { + nfs4_stateid_copy(freeme, &state->open_stateid); nfs_test_and_clear_all_open_stateid(state); return true; } @@ -1437,7 +1492,9 @@ static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_schedule_state_manager(state->owner->so_server->nfs_client); } -static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +static void nfs_set_open_stateid_locked(struct nfs4_state *state, + const nfs4_stateid *stateid, fmode_t fmode, + nfs4_stateid *freeme) { switch (fmode) { case FMODE_READ: @@ -1449,14 +1506,18 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid * case FMODE_READ|FMODE_WRITE: set_bit(NFS_O_RDWR_STATE, &state->flags); } - if (!nfs_need_update_open_stateid(state, stateid)) + if (!nfs_need_update_open_stateid(state, stateid, freeme)) return; if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) nfs4_stateid_copy(&state->stateid, stateid); nfs4_stateid_copy(&state->open_stateid, stateid); } -static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode) +static void __update_open_stateid(struct nfs4_state *state, + const nfs4_stateid *open_stateid, + const nfs4_stateid *deleg_stateid, + fmode_t fmode, + nfs4_stateid *freeme) { /* * Protect the call to nfs4_state_set_mode_locked and @@ -1469,16 +1530,22 @@ static void __update_open_stateid(struct 
nfs4_state *state, nfs4_stateid *open_s set_bit(NFS_DELEGATED_STATE, &state->flags); } if (open_stateid != NULL) - nfs_set_open_stateid_locked(state, open_stateid, fmode); + nfs_set_open_stateid_locked(state, open_stateid, fmode, freeme); write_sequnlock(&state->seqlock); update_open_stateflags(state, fmode); spin_unlock(&state->owner->so_lock); } -static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *delegation, fmode_t fmode) +static int update_open_stateid(struct nfs4_state *state, + const nfs4_stateid *open_stateid, + const nfs4_stateid *delegation, + fmode_t fmode) { + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs_client *clp = server->nfs_client; struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_delegation *deleg_cur; + nfs4_stateid freeme = {0}; int ret = 0; fmode &= (FMODE_READ|FMODE_WRITE); @@ -1500,7 +1567,8 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat goto no_delegation_unlock; nfs_mark_delegation_referenced(deleg_cur); - __update_open_stateid(state, open_stateid, &deleg_cur->stateid, fmode); + __update_open_stateid(state, open_stateid, &deleg_cur->stateid, + fmode, &freeme); ret = 1; no_delegation_unlock: spin_unlock(&deleg_cur->lock); @@ -1508,11 +1576,14 @@ no_delegation: rcu_read_unlock(); if (!ret && open_stateid != NULL) { - __update_open_stateid(state, open_stateid, NULL, fmode); + __update_open_stateid(state, open_stateid, NULL, fmode, &freeme); ret = 1; } if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) - nfs4_schedule_state_manager(state->owner->so_server->nfs_client); + nfs4_schedule_state_manager(clp); + if (freeme.type != 0) + nfs4_test_and_free_stateid(server, &freeme, + state->owner->so_cred); return ret; } @@ -1889,7 +1960,6 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: set_bit(NFS_DELEGATED_STATE, &state->flags); - case -NFS4ERR_EXPIRED: /* Don't recall a delegation if it was lost */ nfs4_schedule_lease_recovery(server->nfs_client); return -EAGAIN; @@ -1901,6 +1971,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct return -EAGAIN; case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OPENMODE: nfs_inode_find_state_and_recover(state->inode, @@ -2382,9 +2453,10 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta return ret; } -static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state) +static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state, + const nfs4_stateid *stateid) { - nfs_remove_bad_delegation(state->inode); + nfs_remove_bad_delegation(state->inode, stateid); write_seqlock(&state->seqlock); nfs4_stateid_copy(&state->stateid, &state->open_stateid); write_sequnlock(&state->seqlock); @@ -2394,7 +2466,7 @@ static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state) static void nfs40_clear_delegation_stateid(struct nfs4_state *state) { if (rcu_access_pointer(NFS_I(state->inode)->delegation) != NULL) - nfs_finish_clear_delegation_stateid(state); + nfs_finish_clear_delegation_stateid(state, NULL); } static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) @@ -2404,7 +2476,45 @@ static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st return nfs4_open_expired(sp, state); } +static int 
nfs40_test_and_free_expired_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + return -NFS4ERR_BAD_STATEID; +} + #if defined(CONFIG_NFS_V4_1) +static int nfs41_test_and_free_expired_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + int status; + + switch (stateid->type) { + default: + break; + case NFS4_INVALID_STATEID_TYPE: + case NFS4_SPECIAL_STATEID_TYPE: + return -NFS4ERR_BAD_STATEID; + case NFS4_REVOKED_STATEID_TYPE: + goto out_free; + } + + status = nfs41_test_stateid(server, stateid, cred); + switch (status) { + case -NFS4ERR_EXPIRED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: + break; + default: + return status; + } +out_free: + /* Ack the revoked state to the server */ + nfs41_free_stateid(server, stateid, cred, true); + return -NFS4ERR_EXPIRED; +} + static void nfs41_check_delegation_stateid(struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(state->inode); @@ -2422,23 +2532,68 @@ static void nfs41_check_delegation_stateid(struct nfs4_state *state) } nfs4_stateid_copy(&stateid, &delegation->stateid); + if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { + rcu_read_unlock(); + nfs_finish_clear_delegation_stateid(state, &stateid); + return; + } + + if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags)) { + rcu_read_unlock(); + return; + } + cred = get_rpccred(delegation->cred); rcu_read_unlock(); - status = nfs41_test_stateid(server, &stateid, cred); + status = nfs41_test_and_free_expired_stateid(server, &stateid, cred); trace_nfs4_test_delegation_stateid(state, NULL, status); - - if (status != NFS_OK) { - /* Free the stateid unless the server explicitly - * informs us the stateid is unrecognized. */ - if (status != -NFS4ERR_BAD_STATEID) - nfs41_free_stateid(server, &stateid, cred); - nfs_finish_clear_delegation_stateid(state); - } + if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) + nfs_finish_clear_delegation_stateid(state, &stateid); put_rpccred(cred); } /** + * nfs41_check_expired_locks - possibly free a lock stateid + * + * @state: NFSv4 state for an inode + * + * Returns NFS_OK if recovery for this stateid is now finished. + * Otherwise a negative NFS4ERR value is returned. 
+ */ +static int nfs41_check_expired_locks(struct nfs4_state *state) +{ + int status, ret = NFS_OK; + struct nfs4_lock_state *lsp; + struct nfs_server *server = NFS_SERVER(state->inode); + + if (!test_bit(LK_STATE_IN_USE, &state->flags)) + goto out; + list_for_each_entry(lsp, &state->lock_states, ls_locks) { + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { + struct rpc_cred *cred = lsp->ls_state->owner->so_cred; + + status = nfs41_test_and_free_expired_stateid(server, + &lsp->ls_stateid, + cred); + trace_nfs4_test_lock_stateid(state, lsp, status); + if (status == -NFS4ERR_EXPIRED || + status == -NFS4ERR_BAD_STATEID) { + clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); + lsp->ls_stateid.type = NFS4_INVALID_STATEID_TYPE; + if (!recover_lost_locks) + set_bit(NFS_LOCK_LOST, &lsp->ls_flags); + } else if (status != NFS_OK) { + ret = status; + break; + } + } + }; +out: + return ret; +} + +/** * nfs41_check_open_stateid - possibly free an open stateid * * @state: NFSv4 state for an inode @@ -2453,26 +2608,28 @@ static int nfs41_check_open_stateid(struct nfs4_state *state) struct rpc_cred *cred = state->owner->so_cred; int status; - /* If a state reset has been done, test_stateid is unneeded */ - if ((test_bit(NFS_O_RDONLY_STATE, &state->flags) == 0) && - (test_bit(NFS_O_WRONLY_STATE, &state->flags) == 0) && - (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0)) + if (test_bit(NFS_OPEN_STATE, &state->flags) == 0) { + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) { + if (nfs4_have_delegation(state->inode, state->state)) + return NFS_OK; + return -NFS4ERR_OPENMODE; + } return -NFS4ERR_BAD_STATEID; - - status = nfs41_test_stateid(server, stateid, cred); + } + status = nfs41_test_and_free_expired_stateid(server, stateid, cred); trace_nfs4_test_open_stateid(state, NULL, status); - if (status != NFS_OK) { - /* Free the stateid unless the server explicitly - * informs us the stateid is unrecognized. 
*/ - if (status != -NFS4ERR_BAD_STATEID) - nfs41_free_stateid(server, stateid, cred); - + if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) { clear_bit(NFS_O_RDONLY_STATE, &state->flags); clear_bit(NFS_O_WRONLY_STATE, &state->flags); clear_bit(NFS_O_RDWR_STATE, &state->flags); clear_bit(NFS_OPEN_STATE, &state->flags); + stateid->type = NFS4_INVALID_STATEID_TYPE; } - return status; + if (status != NFS_OK) + return status; + if (nfs_open_stateid_recover_openmode(state)) + return -NFS4ERR_OPENMODE; + return NFS_OK; } static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) @@ -2480,6 +2637,9 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st int status; nfs41_check_delegation_stateid(state); + status = nfs41_check_expired_locks(state); + if (status != NFS_OK) + return status; status = nfs41_check_open_stateid(state); if (status != NFS_OK) status = nfs4_open_expired(sp, state); @@ -2537,6 +2697,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, goto out; if (server->caps & NFS_CAP_POSIX_LOCK) set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); + if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK) + set_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags); dentry = opendata->dentry; if (d_really_is_negative(dentry)) { @@ -2899,9 +3061,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data) break; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + nfs4_free_revoked_stateid(server, + &calldata->arg.stateid, + task->tk_msg.rpc_cred); case -NFS4ERR_OLD_STATEID: case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_EXPIRED: if (!nfs4_stateid_match(&calldata->arg.stateid, &state->open_stateid)) { rpc_restart_call_prepare(task); @@ -4312,7 +4477,7 @@ static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, s if (error == 0) { /* block layout checks this! */ server->pnfs_blksize = fsinfo->blksize; - set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype); + set_pnfs_layoutdriver(server, fhandle, fsinfo); } return error; @@ -4399,24 +4564,25 @@ static bool nfs4_error_stateid_expired(int err) return false; } -void __nfs4_read_done_cb(struct nfs_pgio_header *hdr) -{ - nfs_invalidate_atime(hdr->inode); -} - static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { struct nfs_server *server = NFS_SERVER(hdr->inode); trace_nfs4_read(hdr, task->tk_status); - if (nfs4_async_handle_error(task, server, - hdr->args.context->state, - NULL) == -EAGAIN) { - rpc_restart_call_prepare(task); - return -EAGAIN; + if (task->tk_status < 0) { + struct nfs4_exception exception = { + .inode = hdr->inode, + .state = hdr->args.context->state, + .stateid = &hdr->args.stateid, + }; + task->tk_status = nfs4_async_handle_exception(task, + server, task->tk_status, &exception); + if (exception.retry) { + rpc_restart_call_prepare(task); + return -EAGAIN; + } } - __nfs4_read_done_cb(hdr); if (task->tk_status > 0) renew_lease(server, hdr->timestamp); return 0; @@ -4445,6 +4611,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) return -EAGAIN; if (nfs4_read_stateid_changed(task, &hdr->args)) return -EAGAIN; + if (task->tk_status > 0) + nfs_invalidate_atime(hdr->inode); return hdr->pgio_done_cb ? 
hdr->pgio_done_cb(task, hdr) : nfs4_read_done_cb(task, hdr); } @@ -4482,11 +4650,19 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct inode *inode = hdr->inode; trace_nfs4_write(hdr, task->tk_status); - if (nfs4_async_handle_error(task, NFS_SERVER(inode), - hdr->args.context->state, - NULL) == -EAGAIN) { - rpc_restart_call_prepare(task); - return -EAGAIN; + if (task->tk_status < 0) { + struct nfs4_exception exception = { + .inode = hdr->inode, + .state = hdr->args.context->state, + .stateid = &hdr->args.stateid, + }; + task->tk_status = nfs4_async_handle_exception(task, + NFS_SERVER(inode), task->tk_status, + &exception); + if (exception.retry) { + rpc_restart_call_prepare(task); + return -EAGAIN; + } } if (task->tk_status >= 0) { renew_lease(NFS_SERVER(inode), hdr->timestamp); @@ -5123,12 +5299,14 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp, if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) { /* An impossible timestamp guarantees this value * will never match a generated boot time. */ - verf[0] = 0; - verf[1] = cpu_to_be32(NSEC_PER_SEC + 1); + verf[0] = cpu_to_be32(U32_MAX); + verf[1] = cpu_to_be32(U32_MAX); } else { struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); - verf[0] = cpu_to_be32(nn->boot_time.tv_sec); - verf[1] = cpu_to_be32(nn->boot_time.tv_nsec); + u64 ns = ktime_to_ns(nn->boot_time); + + verf[0] = cpu_to_be32(ns >> 32); + verf[1] = cpu_to_be32(ns); } memcpy(bootverf->data, verf, sizeof(bootverf->data)); } @@ -5393,10 +5571,13 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) renew_lease(data->res.server, data->timestamp); case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_EXPIRED: + nfs4_free_revoked_stateid(data->res.server, + data->args.stateid, + task->tk_msg.rpc_cred); case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_EXPIRED: task->tk_status = 0; if (data->roc) pnfs_roc_set_barrier(data->inode, data->roc_barrier); @@ -5528,22 +5709,6 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4 return err; }
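The delegreturn hunk above shows a pattern this patch repeats in nfs4_close_done() earlier and in nfs4_locku_done() further down: when the server reports the stateid as revoked or expired, the client first acks the revocation with FREE_STATEID via nfs4_free_revoked_stateid(), then deliberately falls through into the ordinary dead-stateid handling. Condensed from those switch statements (the exact case groupings vary slightly per call site):

    switch (task->tk_status) {
    case -NFS4ERR_ADMIN_REVOKED:
    case -NFS4ERR_DELEG_REVOKED:
    case -NFS4ERR_EXPIRED:
        /* ack first, so the server can release the revoked state... */
        nfs4_free_revoked_stateid(server, stateid, cred);
        /* fall through: ...then treat it like any other dead stateid */
    case -NFS4ERR_BAD_STATEID:
    case -NFS4ERR_OLD_STATEID:
    case -NFS4ERR_STALE_STATEID:
        task->tk_status = 0;
        break;
    }

-#define NFS4_LOCK_MINTIMEOUT (1 * HZ) -#define NFS4_LOCK_MAXTIMEOUT (30 * HZ) - -/* - * sleep, with exponential backoff, and retry the LOCK operation.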
- */ -static unsigned long -nfs4_set_lock_task_retry(unsigned long timeout) -{ - freezable_schedule_timeout_killable_unsafe(timeout); - timeout <<= 1; - if (timeout > NFS4_LOCK_MAXTIMEOUT) - return NFS4_LOCK_MAXTIMEOUT; - return timeout; -} - static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request) { struct inode *inode = state->inode; @@ -5600,11 +5765,6 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock * return err; } -static int do_vfs_lock(struct inode *inode, struct file_lock *fl) -{ - return locks_lock_inode_wait(inode, fl); -} - struct nfs4_unlockdata { struct nfs_locku_args arg; struct nfs_locku_res res; @@ -5657,14 +5817,18 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) switch (task->tk_status) { case 0: renew_lease(calldata->server, calldata->timestamp); - do_vfs_lock(calldata->lsp->ls_state->inode, &calldata->fl); + locks_lock_inode_wait(calldata->lsp->ls_state->inode, &calldata->fl); if (nfs4_update_lock_stateid(calldata->lsp, &calldata->res.stateid)) break; + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_EXPIRED: + nfs4_free_revoked_stateid(calldata->server, + &calldata->arg.stateid, + task->tk_msg.rpc_cred); case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_EXPIRED: if (!nfs4_stateid_match(&calldata->arg.stateid, &calldata->lsp->ls_stateid)) rpc_restart_call_prepare(task); @@ -5765,7 +5929,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * mutex_lock(&sp->so_delegreturn_mutex); /* Exclude nfs4_reclaim_open_stateid() - note nesting! */ down_read(&nfsi->rwsem); - if (do_vfs_lock(inode, request) == -ENOENT) { + if (locks_lock_inode_wait(inode, request) == -ENOENT) { up_read(&nfsi->rwsem); mutex_unlock(&sp->so_delegreturn_mutex); goto out; @@ -5906,7 +6070,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) data->timestamp); if (data->arg.new_lock) { data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS); - if (do_vfs_lock(lsp->ls_state->inode, &data->fl) < 0) { + if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0) { rpc_restart_call_prepare(task); break; } @@ -5965,6 +6129,7 @@ static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_ { switch (error) { case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; if (new_lock_owner != 0 || @@ -5973,7 +6138,6 @@ static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_ break; case -NFS4ERR_STALE_STATEID: lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; - case -NFS4ERR_EXPIRED: nfs4_schedule_lease_recovery(server->nfs_client); }; } @@ -6083,52 +6247,19 @@ out: } #if defined(CONFIG_NFS_V4_1) -/** - * nfs41_check_expired_locks - possibly free a lock stateid - * - * @state: NFSv4 state for an inode - * - * Returns NFS_OK if recovery for this stateid is now finished. - * Otherwise a negative NFS4ERR value is returned. 
- */ -static int nfs41_check_expired_locks(struct nfs4_state *state) -{ - int status, ret = -NFS4ERR_BAD_STATEID; - struct nfs4_lock_state *lsp; - struct nfs_server *server = NFS_SERVER(state->inode); - - list_for_each_entry(lsp, &state->lock_states, ls_locks) { - if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { - struct rpc_cred *cred = lsp->ls_state->owner->so_cred; - - status = nfs41_test_stateid(server, - &lsp->ls_stateid, - cred); - trace_nfs4_test_lock_stateid(state, lsp, status); - if (status != NFS_OK) { - /* Free the stateid unless the server - * informs us the stateid is unrecognized. */ - if (status != -NFS4ERR_BAD_STATEID) - nfs41_free_stateid(server, - &lsp->ls_stateid, - cred); - clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); - ret = status; - } - } - }; - - return ret; -} - static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request) { - int status = NFS_OK; + struct nfs4_lock_state *lsp; + int status; - if (test_bit(LK_STATE_IN_USE, &state->flags)) - status = nfs41_check_expired_locks(state); - if (status != NFS_OK) - status = nfs4_lock_expired(state, request); + status = nfs4_set_lock_state(state, request); + if (status != 0) + return status; + lsp = request->fl_u.nfs4_fl.owner; + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) || + test_bit(NFS_LOCK_LOST, &lsp->ls_flags)) + return 0; + status = nfs4_lock_expired(state, request); return status; } #endif @@ -6138,17 +6269,10 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs4_state_owner *sp = state->owner; unsigned char fl_flags = request->fl_flags; - int status = -ENOLCK; + int status; - if ((fl_flags & FL_POSIX) && - !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) - goto out; - /* Is this a delegated open? */ - status = nfs4_set_lock_state(state, request); - if (status != 0) - goto out; request->fl_flags |= FL_ACCESS; - status = do_vfs_lock(state->inode, request); + status = locks_lock_inode_wait(state->inode, request); if (status < 0) goto out; mutex_lock(&sp->so_delegreturn_mutex); @@ -6157,7 +6281,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock /* Yes: cache locks! */ /* ...but avoid races with delegation recall... 
*/ request->fl_flags = fl_flags & ~FL_SLEEP; - status = do_vfs_lock(state->inode, request); + status = locks_lock_inode_wait(state->inode, request); up_read(&nfsi->rwsem); mutex_unlock(&sp->so_delegreturn_mutex); goto out; @@ -6188,12 +6312,124 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock * return err; } +#define NFS4_LOCK_MINTIMEOUT (1 * HZ) +#define NFS4_LOCK_MAXTIMEOUT (30 * HZ) + +static int +nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd, + struct file_lock *request) +{ + int status = -ERESTARTSYS; + unsigned long timeout = NFS4_LOCK_MINTIMEOUT; + + while(!signalled()) { + status = nfs4_proc_setlk(state, cmd, request); + if ((status != -EAGAIN) || IS_SETLK(cmd)) + break; + freezable_schedule_timeout_interruptible(timeout); + timeout *= 2; + timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout); + status = -ERESTARTSYS; + } + return status; +} + +#ifdef CONFIG_NFS_V4_1 +struct nfs4_lock_waiter { + struct task_struct *task; + struct inode *inode; + struct nfs_lowner *owner; + bool notified; +}; + +static int +nfs4_wake_lock_waiter(wait_queue_t *wait, unsigned int mode, int flags, void *key) +{ + int ret; + struct cb_notify_lock_args *cbnl = key; + struct nfs4_lock_waiter *waiter = wait->private; + struct nfs_lowner *lowner = &cbnl->cbnl_owner, + *wowner = waiter->owner; + + /* Only wake if the callback was for the same owner */ + if (lowner->clientid != wowner->clientid || + lowner->id != wowner->id || + lowner->s_dev != wowner->s_dev) + return 0; + + /* Make sure it's for the right inode */ + if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh)) + return 0; + + waiter->notified = true; + + /* override "private" so we can use default_wake_function */ + wait->private = waiter->task; + ret = autoremove_wake_function(wait, mode, flags, key); + wait->private = waiter; + return ret; +} + +static int +nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + int status = -ERESTARTSYS; + unsigned long flags; + struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner; + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs_client *clp = server->nfs_client; + wait_queue_head_t *q = &clp->cl_lock_waitq; + struct nfs_lowner owner = { .clientid = clp->cl_clientid, + .id = lsp->ls_seqid.owner_id, + .s_dev = server->s_dev }; + struct nfs4_lock_waiter waiter = { .task = current, + .inode = state->inode, + .owner = &owner, + .notified = false }; + wait_queue_t wait; + + /* Don't bother with waitqueue if we don't expect a callback */ + if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags)) + return nfs4_retry_setlk_simple(state, cmd, request); + + init_wait(&wait); + wait.private = &waiter; + wait.func = nfs4_wake_lock_waiter; + add_wait_queue(q, &wait); + + while(!signalled()) { + status = nfs4_proc_setlk(state, cmd, request); + if ((status != -EAGAIN) || IS_SETLK(cmd)) + break; + + status = -ERESTARTSYS; + spin_lock_irqsave(&q->lock, flags); + if (waiter.notified) { + spin_unlock_irqrestore(&q->lock, flags); + continue; + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&q->lock, flags); + + freezable_schedule_timeout_interruptible(NFS4_LOCK_MAXTIMEOUT); + } + + finish_wait(q, &wait); + return status; +} +#else /* !CONFIG_NFS_V4_1 */ +static inline int +nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + return nfs4_retry_setlk_simple(state, cmd, request); +} +#endif +
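This is the consumer half of CB_NOTIFY_LOCK. nfs4_retry_setlk() keeps the old poll-and-retry loop, but when the server advertised NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK it parks the task on clp->cl_lock_waitq behind a filtering wake function, so a callback for somebody else's owner or file leaves it asleep. The producer is nfs4_callback_notify_lock() from earlier in this patch; the two ends, condensed from the hunks shown here:

    /* back channel (callback_proc.c): wake with the decoded args as key */
    if (args->cbnl_valid)
        __wake_up(&cps->clp->cl_lock_waitq, TASK_NORMAL, 0, args);

    /* lock path: a keyed waiter instead of a blind timeout */
    init_wait(&wait);
    wait.private = &waiter;             /* carries owner + inode to match */
    wait.func = nfs4_wake_lock_waiter;  /* compares against cbnl_owner/fh */
    add_wait_queue(&clp->cl_lock_waitq, &wait);

Note that the NFS4_LOCK_MAXTIMEOUT sleep survives as a safety net: CB_NOTIFY_LOCK is only a hint and the server may never send it, so the waiter still retries the LOCK periodically.

static int nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) {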
struct nfs_open_context *ctx; struct nfs4_state *state; - unsigned long timeout = NFS4_LOCK_MINTIMEOUT; int status; /* verify open state */ @@ -6220,6 +6456,11 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) if (state == NULL) return -ENOLCK; + + if ((request->fl_flags & FL_POSIX) && + !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) + return -ENOLCK; + /* * Don't rely on the VFS having checked the file open mode, * since it won't do this for flock() locks. @@ -6234,16 +6475,11 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) return -EBADF; } - do { - status = nfs4_proc_setlk(state, cmd, request); - if ((status != -EAGAIN) || IS_SETLK(cmd)) - break; - timeout = nfs4_set_lock_task_retry(timeout); - status = -ERESTARTSYS; - if (signalled()) - break; - } while(status < 0); - return status; + status = nfs4_set_lock_state(state, request); + if (status != 0) + return status; + + return nfs4_retry_setlk(state, cmd, request); } int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid) @@ -7104,75 +7340,161 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp, return 0; } +struct nfs41_exchange_id_data { + struct nfs41_exchange_id_res res; + struct nfs41_exchange_id_args args; + struct rpc_xprt *xprt; + int rpc_status; +}; + +static void nfs4_exchange_id_done(struct rpc_task *task, void *data) +{ + struct nfs41_exchange_id_data *cdata = + (struct nfs41_exchange_id_data *)data; + struct nfs_client *clp = cdata->args.client; + int status = task->tk_status; + + trace_nfs4_exchange_id(clp, status); + + if (status == 0) + status = nfs4_check_cl_exchange_flags(cdata->res.flags); + + if (cdata->xprt && status == 0) { + status = nfs4_detect_session_trunking(clp, &cdata->res, + cdata->xprt); + goto out; + } + + if (status == 0) + status = nfs4_sp4_select_mode(clp, &cdata->res.state_protect); + + if (status == 0) { + clp->cl_clientid = cdata->res.clientid; + clp->cl_exchange_flags = cdata->res.flags; + /* Client ID is not confirmed */ + if (!(cdata->res.flags & EXCHGID4_FLAG_CONFIRMED_R)) { + clear_bit(NFS4_SESSION_ESTABLISHED, + &clp->cl_session->session_state); + clp->cl_seqid = cdata->res.seqid; + } + + kfree(clp->cl_serverowner); + clp->cl_serverowner = cdata->res.server_owner; + cdata->res.server_owner = NULL; + + /* use the most recent implementation id */ + kfree(clp->cl_implid); + clp->cl_implid = cdata->res.impl_id; + cdata->res.impl_id = NULL; + + if (clp->cl_serverscope != NULL && + !nfs41_same_server_scope(clp->cl_serverscope, + cdata->res.server_scope)) { + dprintk("%s: server_scope mismatch detected\n", + __func__); + set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); + kfree(clp->cl_serverscope); + clp->cl_serverscope = NULL; + } + + if (clp->cl_serverscope == NULL) { + clp->cl_serverscope = cdata->res.server_scope; + cdata->res.server_scope = NULL; + } + /* Save the EXCHANGE_ID verifier session trunk tests */ + memcpy(clp->cl_confirm.data, cdata->args.verifier->data, + sizeof(clp->cl_confirm.data)); + } +out: + cdata->rpc_status = status; + return; +} + +static void nfs4_exchange_id_release(void *data) +{ + struct nfs41_exchange_id_data *cdata = + (struct nfs41_exchange_id_data *)data; + + nfs_put_client(cdata->args.client); + if (cdata->xprt) { + xprt_put(cdata->xprt); + rpc_clnt_xprt_switch_put(cdata->args.client->cl_rpcclient); + } + kfree(cdata->res.impl_id); + kfree(cdata->res.server_scope); + kfree(cdata->res.server_owner); + kfree(cdata); +} + +static const struct rpc_call_ops 
nfs4_exchange_id_call_ops = { + .rpc_call_done = nfs4_exchange_id_done, + .rpc_release = nfs4_exchange_id_release, +}; + /* * _nfs4_proc_exchange_id() * * Wrapper for EXCHANGE_ID operation. */ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, - u32 sp4_how) + u32 sp4_how, struct rpc_xprt *xprt) { nfs4_verifier verifier; - struct nfs41_exchange_id_args args = { - .verifier = &verifier, - .client = clp, -#ifdef CONFIG_NFS_V4_1_MIGRATION - .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | - EXCHGID4_FLAG_BIND_PRINC_STATEID | - EXCHGID4_FLAG_SUPP_MOVED_MIGR, -#else - .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | - EXCHGID4_FLAG_BIND_PRINC_STATEID, -#endif - }; - struct nfs41_exchange_id_res res = { - 0 - }; - int status; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID], - .rpc_argp = &args, - .rpc_resp = &res, .rpc_cred = cred, }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clp->cl_rpcclient, + .callback_ops = &nfs4_exchange_id_call_ops, + .rpc_message = &msg, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, + }; + struct nfs41_exchange_id_data *calldata; + struct rpc_task *task; + int status = -EIO; + + if (!atomic_inc_not_zero(&clp->cl_count)) + goto out; + + status = -ENOMEM; + calldata = kzalloc(sizeof(*calldata), GFP_NOFS); + if (!calldata) + goto out; - nfs4_init_boot_verifier(clp, &verifier); + if (!xprt) + nfs4_init_boot_verifier(clp, &verifier); status = nfs4_init_uniform_client_string(clp); if (status) - goto out; + goto out_calldata; dprintk("NFS call exchange_id auth=%s, '%s'\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, clp->cl_owner_id); - res.server_owner = kzalloc(sizeof(struct nfs41_server_owner), - GFP_NOFS); - if (unlikely(res.server_owner == NULL)) { - status = -ENOMEM; - goto out; - } + calldata->res.server_owner = kzalloc(sizeof(struct nfs41_server_owner), + GFP_NOFS); + status = -ENOMEM; + if (unlikely(calldata->res.server_owner == NULL)) + goto out_calldata; - res.server_scope = kzalloc(sizeof(struct nfs41_server_scope), + calldata->res.server_scope = kzalloc(sizeof(struct nfs41_server_scope), GFP_NOFS); - if (unlikely(res.server_scope == NULL)) { - status = -ENOMEM; + if (unlikely(calldata->res.server_scope == NULL)) goto out_server_owner; - } - res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS); - if (unlikely(res.impl_id == NULL)) { - status = -ENOMEM; + calldata->res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS); + if (unlikely(calldata->res.impl_id == NULL)) goto out_server_scope; - } switch (sp4_how) { case SP4_NONE: - args.state_protect.how = SP4_NONE; + calldata->args.state_protect.how = SP4_NONE; break; case SP4_MACH_CRED: - args.state_protect = nfs4_sp4_mach_cred_request; + calldata->args.state_protect = nfs4_sp4_mach_cred_request; break; default: @@ -7181,56 +7503,42 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, status = -EINVAL; goto out_impl_id; } + if (xprt) { + calldata->xprt = xprt; + task_setup_data.rpc_xprt = xprt; + task_setup_data.flags = + RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC; + calldata->args.verifier = &clp->cl_confirm; + } else { + calldata->args.verifier = &verifier; + } + calldata->args.client = clp; +#ifdef CONFIG_NFS_V4_1_MIGRATION + calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | + EXCHGID4_FLAG_BIND_PRINC_STATEID | + EXCHGID4_FLAG_SUPP_MOVED_MIGR; +#else + calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | + EXCHGID4_FLAG_BIND_PRINC_STATEID; +#endif + msg.rpc_argp = &calldata->args; + 
msg.rpc_resp = &calldata->res; + task_setup_data.callback_data = calldata; - status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); - trace_nfs4_exchange_id(clp, status); - if (status == 0) - status = nfs4_check_cl_exchange_flags(res.flags); - - if (status == 0) - status = nfs4_sp4_select_mode(clp, &res.state_protect); - - if (status == 0) { - clp->cl_clientid = res.clientid; - clp->cl_exchange_flags = res.flags; - /* Client ID is not confirmed */ - if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) { - clear_bit(NFS4_SESSION_ESTABLISHED, - &clp->cl_session->session_state); - clp->cl_seqid = res.seqid; - } - - kfree(clp->cl_serverowner); - clp->cl_serverowner = res.server_owner; - res.server_owner = NULL; - - /* use the most recent implementation id */ - kfree(clp->cl_implid); - clp->cl_implid = res.impl_id; - res.impl_id = NULL; - - if (clp->cl_serverscope != NULL && - !nfs41_same_server_scope(clp->cl_serverscope, - res.server_scope)) { - dprintk("%s: server_scope mismatch detected\n", - __func__); - set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); - kfree(clp->cl_serverscope); - clp->cl_serverscope = NULL; - } - - if (clp->cl_serverscope == NULL) { - clp->cl_serverscope = res.server_scope; - res.server_scope = NULL; - } + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + status = PTR_ERR(task); + goto out_impl_id; } -out_impl_id: - kfree(res.impl_id); -out_server_scope: - kfree(res.server_scope); -out_server_owner: - kfree(res.server_owner); + if (!xprt) { + status = rpc_wait_for_completion_task(task); + if (!status) + status = calldata->rpc_status; + } else /* session trunking test */ + status = calldata->rpc_status; + + rpc_put_task(task); out: if (clp->cl_implid != NULL) dprintk("NFS reply exchange_id: Server Implementation ID: " @@ -7240,6 +7548,16 @@ out: clp->cl_implid->date.nseconds); dprintk("NFS reply exchange_id: %d\n", status); return status; + +out_impl_id: + kfree(calldata->res.impl_id); +out_server_scope: + kfree(calldata->res.server_scope); +out_server_owner: + kfree(calldata->res.server_owner); +out_calldata: + kfree(calldata); + goto out; } /* @@ -7262,14 +7580,45 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) /* try SP4_MACH_CRED if krb5i/p */ if (authflavor == RPC_AUTH_GSS_KRB5I || authflavor == RPC_AUTH_GSS_KRB5P) { - status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED); + status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED, NULL); if (!status) return 0; } /* try SP4_NONE */ - return _nfs4_proc_exchange_id(clp, cred, SP4_NONE); + return _nfs4_proc_exchange_id(clp, cred, SP4_NONE, NULL); +} + +/** + * nfs4_test_session_trunk + * + * This is an add_xprt_test() test function called from + * rpc_clnt_setup_test_and_add_xprt. + * + * The rpc_xprt_switch is referenced by rpc_clnt_setup_test_and_add_xprt + * and is dereferenced in nfs4_exchange_id_release. + * + * Upon success, add the new transport to the rpc_clnt + * + * @clnt: struct rpc_clnt to get new transport + * @xprt: the rpc_xprt to test + * @data: call data for _nfs4_proc_exchange_id. + */ +int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, + void *data) +{ + struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data; + u32 sp4_how; + + dprintk("--> %s try %s\n", __func__, + xprt->address_strings[RPC_DISPLAY_ADDR]); + + sp4_how = (adata->clp->cl_sp4_flags == 0 ? SP4_NONE : SP4_MACH_CRED); + + /* Test connection for session trunking. 
Async exchange_id call */ + return _nfs4_proc_exchange_id(adata->clp, adata->cred, sp4_how, xprt); } +EXPORT_SYMBOL_GPL(nfs4_test_session_trunk); static int _nfs4_proc_destroy_clientid(struct nfs_client *clp, struct rpc_cred *cred) @@ -7463,7 +7812,7 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args, args->bc_attrs.max_resp_sz = max_bc_payload; args->bc_attrs.max_resp_sz_cached = 0; args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS; - args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS; + args->bc_attrs.max_reqs = max_t(unsigned short, max_session_cb_slots, 1); dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u " "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n", @@ -7510,10 +7859,9 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args return -EINVAL; if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached) return -EINVAL; - /* These would render the backchannel useless: */ - if (rcvd->max_ops != sent->max_ops) + if (rcvd->max_ops > sent->max_ops) return -EINVAL; - if (rcvd->max_reqs != sent->max_reqs) + if (rcvd->max_reqs > sent->max_reqs) return -EINVAL; out: return 0; @@ -7982,6 +8330,8 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, case -NFS4ERR_RECALLCONFLICT: status = -ERECALLCONFLICT; break; + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: exception->timeout = 0; @@ -7993,6 +8343,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, &lgp->args.ctx->state->stateid)) { spin_unlock(&inode->i_lock); exception->state = lgp->args.ctx->state; + exception->stateid = &lgp->args.stateid; break; } @@ -8591,6 +8942,24 @@ static int _nfs41_test_stateid(struct nfs_server *server, return -res.status; } +static void nfs4_handle_delay_or_session_error(struct nfs_server *server, + int err, struct nfs4_exception *exception) +{ + exception->retry = 0; + switch (err) { + case -NFS4ERR_DELAY: + case -NFS4ERR_RETRY_UNCACHED_REP: + nfs4_handle_exception(server, err, exception); + break; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_DEADSESSION: + nfs4_do_handle_exception(server, err, exception); + } +} + /** * nfs41_test_stateid - perform a TEST_STATEID operation * @@ -8610,9 +8979,7 @@ static int nfs41_test_stateid(struct nfs_server *server, int err; do { err = _nfs41_test_stateid(server, stateid, cred); - if (err != -NFS4ERR_DELAY) - break; - nfs4_handle_exception(server, err, &exception); + nfs4_handle_delay_or_session_error(server, err, &exception); } while (exception.retry); return err; } @@ -8657,7 +9024,7 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = { }; static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, - nfs4_stateid *stateid, + const nfs4_stateid *stateid, struct rpc_cred *cred, bool privileged) { @@ -8687,7 +9054,7 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, msg.rpc_argp = &data->args; msg.rpc_resp = &data->res; - nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); if (privileged) nfs4_set_sequence_privileged(&data->args.seq_args); @@ -8700,38 +9067,31 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, * @server: server / transport on which to perform the operation * @stateid: state ID to release * @cred: credential + * @is_recovery: set to true if this call needs to be privileged + * - * Returns 
NFS_OK if the server freed "stateid". Otherwise a - * negative NFS4ERR value is returned. + * Note: this function is always asynchronous. */ static int nfs41_free_stateid(struct nfs_server *server, - nfs4_stateid *stateid, - struct rpc_cred *cred) + const nfs4_stateid *stateid, + struct rpc_cred *cred, + bool is_recovery) { struct rpc_task *task; - int ret; - task = _nfs41_free_stateid(server, stateid, cred, true); + task = _nfs41_free_stateid(server, stateid, cred, is_recovery); if (IS_ERR(task)) return PTR_ERR(task); - ret = rpc_wait_for_completion_task(task); - if (!ret) - ret = task->tk_status; rpc_put_task(task); - return ret; + return 0; } static void nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) { - struct rpc_task *task; struct rpc_cred *cred = lsp->ls_state->owner->so_cred; - task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false); + nfs41_free_stateid(server, &lsp->ls_stateid, cred, false); nfs4_free_lock_state(server, lsp); - if (IS_ERR(task)) - return; - rpc_put_task(task); } static bool nfs41_match_stateid(const nfs4_stateid *s1, @@ -8835,6 +9195,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .match_stateid = nfs4_match_stateid, .find_root_sec = nfs4_find_root_sec, .free_lock_state = nfs4_release_lockowner, + .test_and_free_expired = nfs40_test_and_free_expired_stateid, .alloc_seqid = nfs_alloc_seqid, .call_sync_ops = &nfs40_call_sync_ops, .reboot_recovery_ops = &nfs40_reboot_recovery_ops, @@ -8862,7 +9223,9 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .match_stateid = nfs41_match_stateid, .find_root_sec = nfs41_find_root_sec, .free_lock_state = nfs41_free_lock_state, + .test_and_free_expired = nfs41_test_and_free_expired_stateid, .alloc_seqid = nfs_alloc_no_seqid, + .session_trunk = nfs4_test_session_trunk, .call_sync_ops = &nfs41_call_sync_ops, .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, @@ -8891,7 +9254,9 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { .find_root_sec = nfs41_find_root_sec, .free_lock_state = nfs41_free_lock_state, .call_sync_ops = &nfs41_call_sync_ops, + .test_and_free_expired = nfs41_test_and_free_expired_stateid, .alloc_seqid = nfs_alloc_no_seqid, + .session_trunk = nfs4_test_session_trunk, .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, .state_renewal_ops = &nfs41_state_renewal_ops, diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index f703b755351b..dae385500005 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -9,6 +9,7 @@ /* maximum number of slots to use */ #define NFS4_DEF_SLOT_TABLE_SIZE (64U) +#define NFS4_DEF_CB_SLOT_TABLE_SIZE (1U) #define NFS4_MAX_SLOT_TABLE (1024U) #define NFS4_NO_SLOT ((u32)-1) @@ -22,6 +23,7 @@ struct nfs4_slot { u32 slot_nr; u32 seq_nr; unsigned int interrupted : 1, + privileged : 1, seq_done : 1; }; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index cada00aa5096..5f4281ec5f72 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -991,6 +991,8 @@ int nfs4_select_rw_stateid(struct nfs4_state *state, { int ret; + if (!nfs4_valid_open_stateid(state)) + return -EIO; if (cred != NULL) *cred = NULL; ret = nfs4_copy_lock_stateid(dst, state, lockowner); @@ -1303,6 +1305,8 @@ void nfs4_schedule_path_down_recovery(struct nfs_client *clp) static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) { + if (!nfs4_valid_open_stateid(state)) + 
return 0; set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); /* Don't recover state that expired before the reboot */ if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) { @@ -1316,6 +1320,8 @@ static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_st int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) { + if (!nfs4_valid_open_stateid(state)) + return 0; set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); set_bit(NFS_OWNER_RECLAIM_NOGRACE, &state->owner->so_flags); @@ -1327,9 +1333,8 @@ int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_ { struct nfs_client *clp = server->nfs_client; - if (!nfs4_valid_open_stateid(state)) + if (!nfs4_state_mark_reclaim_nograce(clp, state)) return -EBADF; - nfs4_state_mark_reclaim_nograce(clp, state); dprintk("%s: scheduling stateid recovery for server %s\n", __func__, clp->cl_hostname); nfs4_schedule_state_manager(clp); @@ -1337,6 +1342,35 @@ int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_ } EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery); +static struct nfs4_lock_state * +nfs_state_find_lock_state_by_stateid(struct nfs4_state *state, + const nfs4_stateid *stateid) +{ + struct nfs4_lock_state *pos; + + list_for_each_entry(pos, &state->lock_states, ls_locks) { + if (!test_bit(NFS_LOCK_INITIALIZED, &pos->ls_flags)) + continue; + if (nfs4_stateid_match_other(&pos->ls_stateid, stateid)) + return pos; + } + return NULL; +} + +static bool nfs_state_lock_state_matches_stateid(struct nfs4_state *state, + const nfs4_stateid *stateid) +{ + bool found = false; + + if (test_bit(LK_STATE_IN_USE, &state->flags)) { + spin_lock(&state->state_lock); + if (nfs_state_find_lock_state_by_stateid(state, stateid)) + found = true; + spin_unlock(&state->state_lock); + } + return found; +} + void nfs_inode_find_state_and_recover(struct inode *inode, const nfs4_stateid *stateid) { @@ -1351,14 +1385,18 @@ void nfs_inode_find_state_and_recover(struct inode *inode, state = ctx->state; if (state == NULL) continue; - if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) + if (nfs4_stateid_match_other(&state->stateid, stateid) && + nfs4_state_mark_reclaim_nograce(clp, state)) { + found = true; continue; - if (!nfs4_stateid_match(&state->stateid, stateid)) - continue; - nfs4_state_mark_reclaim_nograce(clp, state); - found = true; + } + if (nfs_state_lock_state_matches_stateid(state, stateid) && + nfs4_state_mark_reclaim_nograce(clp, state)) + found = true; } spin_unlock(&inode->i_lock); + + nfs_inode_find_delegation_state_and_recover(inode, stateid); if (found) nfs4_schedule_state_manager(clp); } @@ -1498,6 +1536,9 @@ restart: __func__, status); case -ENOENT: case -ENOMEM: + case -EACCES: + case -EROFS: + case -EIO: case -ESTALE: /* Open state on this file cannot be recovered */ nfs4_state_mark_recovery_failed(state, status); @@ -1656,15 +1697,9 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) put_rpccred(cred); } -static void nfs_delegation_clear_all(struct nfs_client *clp) -{ - nfs_delegation_mark_reclaim(clp); - nfs_delegation_reap_unclaimed(clp); -} - static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) { - nfs_delegation_clear_all(clp); + nfs_mark_test_expired_all_delegations(clp); nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); } @@ -2195,7 +2230,7 @@ static void nfs41_handle_all_state_revoked(struct nfs_client *clp) static void 
nfs41_handle_some_state_revoked(struct nfs_client *clp) { - nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); + nfs4_state_start_reclaim_nograce(clp); nfs4_schedule_state_manager(clp); dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname); @@ -2227,13 +2262,22 @@ static void nfs41_handle_cb_path_down(struct nfs_client *clp) nfs4_schedule_state_manager(clp); } -void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) +void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags, + bool recovery) { if (!flags) return; dprintk("%s: \"%s\" (client ID %llx) flags=0x%08x\n", __func__, clp->cl_hostname, clp->cl_clientid, flags); + /* + * If we're called from the state manager thread, then assume we're + * already handling the RECLAIM_NEEDED and/or STATE_REVOKED. + * Those flags are expected to remain set until we're done + * recovering (see RFC5661, section 18.46.3). + */ + if (recovery) + goto out_recovery; if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) nfs41_handle_server_reboot(clp); @@ -2246,6 +2290,7 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) nfs4_schedule_lease_moved_recovery(clp); if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) nfs41_handle_recallable_state_revoked(clp); +out_recovery: if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT) nfs41_handle_backchannel_fault(clp); else if (flags & (SEQ4_STATUS_CB_PATH_DOWN | @@ -2410,6 +2455,13 @@ static void nfs4_state_manager(struct nfs_client *clp) nfs4_state_end_reclaim_reboot(clp); } + /* Detect expired delegations... */ + if (test_and_clear_bit(NFS4CLNT_DELEGATION_EXPIRED, &clp->cl_state)) { + section = "detect expired delegations"; + nfs_reap_expired_delegations(clp); + continue; + } + /* Now recover expired state... */ if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { section = "reclaim nograce"; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 7bd3a5c09d31..fc89e5ed07ee 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1850,7 +1850,7 @@ static void encode_create_session(struct xdr_stream *xdr, *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */ /* authsys_parms rfc1831 */ - *p++ = cpu_to_be32(nn->boot_time.tv_nsec); /* stamp */ + *p++ = cpu_to_be32(ktime_to_ns(nn->boot_time)); /* stamp */ p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen); *p++ = cpu_to_be32(0); /* UID */ *p++ = cpu_to_be32(0); /* GID */ @@ -4725,34 +4725,37 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, } /* - * Decode potentially multiple layout types. Currently we only support - * one layout driver per file system. + * Decode potentially multiple layout types. 
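+ * The full list is stored in fsinfo->layouttype[], capped at NFS_MAX_LAYOUT_TYPES; set_pnfs_layoutdriver() later selects a single driver from it using the ld_prefs[] preference order.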
*/ -static int decode_first_pnfs_layout_type(struct xdr_stream *xdr, - uint32_t *layouttype) +static int decode_pnfs_layout_types(struct xdr_stream *xdr, + struct nfs_fsinfo *fsinfo) { __be32 *p; - int num; + uint32_t i; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - num = be32_to_cpup(p); + fsinfo->nlayouttypes = be32_to_cpup(p); /* pNFS is not supported by the underlying file system */ - if (num == 0) { - *layouttype = 0; + if (fsinfo->nlayouttypes == 0) return 0; - } - if (num > 1) - printk(KERN_INFO "NFS: %s: Warning: Multiple pNFS layout " "drivers per filesystem not supported\n", __func__); - /* Decode and set first layout type, move xdr->p past unused types */ + /* Decode all layout types, moving xdr->p past the whole list */ - p = xdr_inline_decode(xdr, num * 4); + p = xdr_inline_decode(xdr, fsinfo->nlayouttypes * 4); if (unlikely(!p)) goto out_overflow; + + /* If we get too many, then just cap it at the max */ + if (fsinfo->nlayouttypes > NFS_MAX_LAYOUT_TYPES) { + printk(KERN_INFO "NFS: %s: Warning: Too many (%u) pNFS layout types\n", + __func__, fsinfo->nlayouttypes); + fsinfo->nlayouttypes = NFS_MAX_LAYOUT_TYPES; + } + + for (i = 0; i < fsinfo->nlayouttypes; ++i) + fsinfo->layouttype[i] = be32_to_cpup(p++); return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -4764,7 +4767,7 @@ out_overflow: * Note we must ensure that layouttype is set in any non-error case. */ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, - uint32_t *layouttype) + struct nfs_fsinfo *fsinfo) { int status = 0; @@ -4772,10 +4775,9 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U))) return -EIO; if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) { - status = decode_first_pnfs_layout_type(xdr, layouttype); + status = decode_pnfs_layout_types(xdr, fsinfo); bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES; - } else - *layouttype = 0; + } return status; } @@ -4856,7 +4858,7 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta); if (status != 0) goto xdr_error; - status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); + status = decode_attr_pnfstype(xdr, bitmap, fsinfo); if (status != 0) goto xdr_error; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 2c93a85eda51..56b2d96f9103 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -30,6 +30,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_page.h> #include <linux/module.h> +#include <linux/sort.h> #include "internal.h" #include "pnfs.h" #include "iostat.h" @@ -99,35 +100,79 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss) } /* + * When the server sends a list of layout types, we choose one in the order + * given in the list below. + * + * FIXME: should this list be configurable in some fashion? module param? + * mount option? something else? + */ +static const u32 ld_prefs[] = { + LAYOUT_SCSI, + LAYOUT_BLOCK_VOLUME, + LAYOUT_OSD2_OBJECTS, + LAYOUT_FLEX_FILES, + LAYOUT_NFSV4_1_FILES, + 0 +}; + +static int +ld_cmp(const void *e1, const void *e2) +{ + u32 ld1 = *((u32 *)e1); + u32 ld2 = *((u32 *)e2); + int i; + + for (i = 0; ld_prefs[i] != 0; i++) { + if (ld1 == ld_prefs[i]) + return -1; + + if (ld2 == ld_prefs[i]) + return 1; + } + return 0; +} + +/* * Try to set the server's pnfs module to the pnfs layout type specified by id. * Currently only one pNFS layout driver per filesystem is supported. * - * @id layout type. Zero (illegal layout type) indicates pNFS not in use. 
+ * @ids array of layout types supported by MDS. */ void set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, - u32 id) + struct nfs_fsinfo *fsinfo) { struct pnfs_layoutdriver_type *ld_type = NULL; + u32 id; + int i; - if (id == 0) - goto out_no_driver; if (!(server->nfs_client->cl_exchange_flags & (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { - printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n", - __func__, id, server->nfs_client->cl_exchange_flags); + printk(KERN_ERR "NFS: %s: cl_exchange_flags 0x%x\n", + __func__, server->nfs_client->cl_exchange_flags); goto out_no_driver; } - ld_type = find_pnfs_driver(id); - if (!ld_type) { - request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); + + sort(fsinfo->layouttype, fsinfo->nlayouttypes, + sizeof(*fsinfo->layouttype), ld_cmp, NULL); + + for (i = 0; i < fsinfo->nlayouttypes; i++) { + id = fsinfo->layouttype[i]; ld_type = find_pnfs_driver(id); if (!ld_type) { - dprintk("%s: No pNFS module found for %u.\n", - __func__, id); - goto out_no_driver; + request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, + id); + ld_type = find_pnfs_driver(id); } + if (ld_type) + break; + } + + if (!ld_type) { + dprintk("%s: No pNFS module found!\n", __func__); + goto out_no_driver; } + server->pnfs_curr_ld = ld_type; if (ld_type->set_layoutdriver && ld_type->set_layoutdriver(server, mntfh)) { @@ -2185,10 +2230,8 @@ static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr) */ void pnfs_ld_read_done(struct nfs_pgio_header *hdr) { - if (likely(!hdr->pnfs_error)) { - __nfs4_read_done_cb(hdr); + if (likely(!hdr->pnfs_error)) hdr->mds_ops->rpc_call_done(&hdr->task, hdr); - } trace_nfs4_pnfs_read(hdr, hdr->pnfs_error); if (unlikely(hdr->pnfs_error)) pnfs_ld_handle_read_error(hdr); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 31d99b2927b0..5c295512c967 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -236,7 +236,7 @@ void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_put_lseg(struct pnfs_layout_segment *lseg); void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg); -void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); +void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *); void unset_pnfs_layoutdriver(struct nfs_server *); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); @@ -657,7 +657,8 @@ pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) } static inline void set_pnfs_layoutdriver(struct nfs_server *s, - const struct nfs_fh *mntfh, u32 id) + const struct nfs_fh *mntfh, + struct nfs_fsinfo *fsinfo) { } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index f3468b57a32a..53b4705abcc7 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -690,13 +690,50 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, dprintk("%s: DS %s: trying address %s\n", __func__, ds->ds_remotestr, da->da_remotestr); - clp = nfs4_set_ds_client(mds_srv, - (struct sockaddr *)&da->da_addr, - da->da_addrlen, IPPROTO_TCP, - timeo, retrans, minor_version, - au_flavor); - if (!IS_ERR(clp)) - break; + if (!IS_ERR(clp) && clp->cl_mvops->session_trunk) { + struct xprt_create xprt_args = { + .ident = XPRT_TRANSPORT_TCP, + .net = clp->cl_net, + .dstaddr = (struct sockaddr *)&da->da_addr, + .addrlen = da->da_addrlen, + .servername = clp->cl_hostname, + }; + struct nfs4_add_xprt_data xprtdata = { + .clp = clp, + .cred = 
nfs4_get_clid_cred(clp), + }; + struct rpc_add_xprt_test rpcdata = { + .add_xprt_test = clp->cl_mvops->session_trunk, + .data = &xprtdata, + }; + + /* + * Test this address for session trunking and + * add as an alias + */ + rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, + rpc_clnt_setup_test_and_add_xprt, + &rpcdata); + if (xprtdata.cred) + put_rpccred(xprtdata.cred); + } else { + clp = nfs4_set_ds_client(mds_srv, + (struct sockaddr *)&da->da_addr, + da->da_addrlen, IPPROTO_TCP, + timeo, retrans, minor_version, + au_flavor); + if (IS_ERR(clp)) + continue; + + status = nfs4_init_ds_session(clp, + mds_srv->nfs_client->cl_lease_time); + if (status) { + nfs_put_client(clp); + clp = ERR_PTR(-EIO); + continue; + } + + } } if (IS_ERR(clp)) { @@ -704,18 +741,11 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, goto out; } - status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time); - if (status) - goto out_put; - smp_wmb(); ds->ds_clp = clp; dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr); out: return status; -out_put: - nfs_put_client(clp); - goto out; } /* diff --git a/fs/nfs/super.c b/fs/nfs/super.c index d39601381adf..001796bcd6c8 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2848,19 +2848,23 @@ out_invalid_transport_udp: * NFS client for backwards compatibility */ unsigned int nfs_callback_set_tcpport; +unsigned short nfs_callback_nr_threads; /* Default cache timeout is 10 minutes */ unsigned int nfs_idmap_cache_timeout = 600; /* Turn off NFSv4 uid/gid mapping when using AUTH_SYS */ bool nfs4_disable_idmapping = true; unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE; +unsigned short max_session_cb_slots = NFS4_DEF_CB_SLOT_TABLE_SIZE; unsigned short send_implementation_id = 1; char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = ""; bool recover_lost_locks = false; +EXPORT_SYMBOL_GPL(nfs_callback_nr_threads); EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport); EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout); EXPORT_SYMBOL_GPL(nfs4_disable_idmapping); EXPORT_SYMBOL_GPL(max_session_slots); +EXPORT_SYMBOL_GPL(max_session_cb_slots); EXPORT_SYMBOL_GPL(send_implementation_id); EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier); EXPORT_SYMBOL_GPL(recover_lost_locks); @@ -2887,6 +2891,9 @@ static const struct kernel_param_ops param_ops_portnr = { #define param_check_portnr(name, p) __param_check(name, p, unsigned int); module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); +module_param_named(callback_nr_threads, nfs_callback_nr_threads, ushort, 0644); +MODULE_PARM_DESC(callback_nr_threads, "Number of threads that will be " "assigned to the NFSv4 callback channels."); module_param(nfs_idmap_cache_timeout, int, 0644); module_param(nfs4_disable_idmapping, bool, 0644); module_param_string(nfs4_unique_id, nfs4_client_id_uniquifier, @@ -2896,6 +2903,9 @@ MODULE_PARM_DESC(nfs4_disable_idmapping, module_param(max_session_slots, ushort, 0644); MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 " "requests the client will negotiate"); +module_param(max_session_cb_slots, ushort, 0644); +MODULE_PARM_DESC(max_session_cb_slots, "Maximum number of parallel NFSv4.1 " + "callbacks the client will process for a given server"); module_param(send_implementation_id, ushort, 0644); MODULE_PARM_DESC(send_implementation_id, "Send implementation ID with NFSv4.1 exchange_id"); diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c index df880e9fa71f..b67287383010 100644 --- a/fs/nfsd/flexfilelayout.c +++ 
b/fs/nfsd/flexfilelayout.c @@ -126,6 +126,7 @@ nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp, const struct nfsd4_layout_ops ff_layout_ops = { .notify_types = NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE, + .disable_recalls = true, .proc_getdeviceinfo = nfsd4_ff_proc_getdeviceinfo, .encode_getdeviceinfo = nfsd4_ff_encode_getdeviceinfo, .proc_layoutget = nfsd4_ff_proc_layoutget, diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 5fbf3bbd00d0..b10d557f9c9e 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -84,6 +84,7 @@ struct nfsd_net { struct list_head client_lru; struct list_head close_lru; struct list_head del_recall_lru; + struct list_head blocked_locks_lru; struct delayed_work laundromat_work; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 04c68d900324..211dc2aed8e1 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -448,7 +448,7 @@ static int decode_cb_sequence4res(struct xdr_stream *xdr, { int status; - if (cb->cb_minorversion == 0) + if (cb->cb_clp->cl_minorversion == 0) return 0; status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_seq_status); @@ -485,7 +485,7 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, const struct nfs4_delegation *dp = cb_to_delegation(cb); struct nfs4_cb_compound_hdr hdr = { .ident = cb->cb_clp->cl_cb_ident, - .minorversion = cb->cb_minorversion, + .minorversion = cb->cb_clp->cl_minorversion, }; encode_cb_compound4args(xdr, &hdr); @@ -594,7 +594,7 @@ static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, container_of(cb, struct nfs4_layout_stateid, ls_recall); struct nfs4_cb_compound_hdr hdr = { .ident = 0, - .minorversion = cb->cb_minorversion, + .minorversion = cb->cb_clp->cl_minorversion, }; encode_cb_compound4args(xdr, &hdr); @@ -623,6 +623,62 @@ static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, } #endif /* CONFIG_NFSD_PNFS */ +static void encode_stateowner(struct xdr_stream *xdr, struct nfs4_stateowner *so) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 8 + 4 + so->so_owner.len); + p = xdr_encode_opaque_fixed(p, &so->so_client->cl_clientid, 8); + xdr_encode_opaque(p, so->so_owner.data, so->so_owner.len); +} + +static void nfs4_xdr_enc_cb_notify_lock(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfsd4_callback *cb) +{ + const struct nfsd4_blocked_lock *nbl = + container_of(cb, struct nfsd4_blocked_lock, nbl_cb); + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)nbl->nbl_lock.fl_owner; + struct nfs4_cb_compound_hdr hdr = { + .ident = 0, + .minorversion = cb->cb_clp->cl_minorversion, + }; + + __be32 *p; + + BUG_ON(hdr.minorversion == 0); + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(OP_CB_NOTIFY_LOCK); + encode_nfs_fh4(xdr, &nbl->nbl_fh); + encode_stateowner(xdr, &lo->lo_owner); + hdr.nops++; + + encode_cb_nops(&hdr); +} + +static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfsd4_callback *cb) +{ + struct nfs4_cb_compound_hdr hdr; + int status; + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + return status; + + if (cb) { + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + } + return decode_cb_op_status(xdr, OP_CB_NOTIFY_LOCK, &cb->cb_status); +} + /* * RPC procedure tables */ @@ -643,6 +699,7 @@ static struct rpc_procinfo nfs4_cb_procedures[] = { #ifdef CONFIG_NFSD_PNFS PROC(CB_LAYOUT, COMPOUND, 
cb_layout, cb_layout), #endif + PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock), }; static struct rpc_version nfs_cb_version4 = { @@ -862,7 +919,6 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) struct nfs4_client *clp = cb->cb_clp; u32 minorversion = clp->cl_minorversion; - cb->cb_minorversion = minorversion; /* * cb_seq_status is only set in decode_cb_sequence4res, * and so will remain 1 if an rpc level failure occurs. diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 2be9602b0221..42aace4fc4c8 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -174,7 +174,8 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) list_del_init(&ls->ls_perfile); spin_unlock(&fp->fi_lock); - vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls); + if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) + vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls); fput(ls->ls_file); if (ls->ls_recalled) @@ -189,6 +190,9 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) struct file_lock *fl; int status; + if (nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) + return 0; + fl = locks_alloc_lock(); if (!fl) return -ENOMEM; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 1fb222752b2b..abb09b580389 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1010,47 +1010,97 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } static __be32 -nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - struct nfsd4_clone *clone) +nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + stateid_t *src_stateid, struct file **src, + stateid_t *dst_stateid, struct file **dst) { - struct file *src, *dst; __be32 status; status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh, - &clone->cl_src_stateid, RD_STATE, - &src, NULL); + src_stateid, RD_STATE, src, NULL); if (status) { dprintk("NFSD: %s: couldn't process src stateid!\n", __func__); goto out; } status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, - &clone->cl_dst_stateid, WR_STATE, - &dst, NULL); + dst_stateid, WR_STATE, dst, NULL); if (status) { dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__); goto out_put_src; } /* fix up for NFS-specific error code */ - if (!S_ISREG(file_inode(src)->i_mode) || - !S_ISREG(file_inode(dst)->i_mode)) { + if (!S_ISREG(file_inode(*src)->i_mode) || + !S_ISREG(file_inode(*dst)->i_mode)) { status = nfserr_wrong_type; goto out_put_dst; } +out: + return status; +out_put_dst: + fput(*dst); +out_put_src: + fput(*src); + goto out; +} + +static __be32 +nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_clone *clone) +{ + struct file *src, *dst; + __be32 status; + + status = nfsd4_verify_copy(rqstp, cstate, &clone->cl_src_stateid, &src, + &clone->cl_dst_stateid, &dst); + if (status) + goto out; + status = nfsd4_clone_file_range(src, clone->cl_src_pos, dst, clone->cl_dst_pos, clone->cl_count); -out_put_dst: fput(dst); -out_put_src: fput(src); out: return status; } static __be32 +nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_copy *copy) +{ + struct file *src, *dst; + __be32 status; + ssize_t bytes; + + status = nfsd4_verify_copy(rqstp, cstate, &copy->cp_src_stateid, &src, + &copy->cp_dst_stateid, &dst); + if (status) + goto out; + + bytes = nfsd_copy_file_range(src, copy->cp_src_pos, + dst, copy->cp_dst_pos, copy->cp_count); + + if (bytes < 0) + status = nfserrno(bytes); + else { + 
copy->cp_res.wr_bytes_written = bytes; + copy->cp_res.wr_stable_how = NFS_UNSTABLE; + copy->cp_consecutive = 1; + copy->cp_synchronous = 1; + gen_boot_verifier(&copy->cp_res.wr_verifier, SVC_NET(rqstp)); + status = nfs_ok; + } + + fput(src); + fput(dst); out: + return status; +} + +static __be32 nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_fallocate *fallocate, int flags) { @@ -1966,6 +2016,18 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd op_encode_channel_attrs_maxsz) * sizeof(__be32); } +static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* wr_callback */ + + op_encode_stateid_maxsz /* wr_callback */ + + 2 /* wr_count */ + + 1 /* wr_committed */ + + op_encode_verifier_maxsz + + 1 /* cr_consecutive */ + + 1 /* cr_synchronous */) * sizeof(__be32); +} + #ifdef CONFIG_NFSD_PNFS /* * At this stage we don't really know what layout driver will handle the request, @@ -2328,6 +2390,12 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_name = "OP_CLONE", .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, + [OP_COPY] = { + .op_func = (nfsd4op_func)nfsd4_copy, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_COPY", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_copy_rsize, + }, [OP_SEEK] = { .op_func = (nfsd4op_func)nfsd4_seek, .op_name = "OP_SEEK", diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 39bfaba9c99c..9752beb78659 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -99,6 +99,7 @@ static struct kmem_cache *odstate_slab; static void free_session(struct nfsd4_session *); static const struct nfsd4_callback_ops nfsd4_cb_recall_ops; +static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops; static bool is_session_dead(struct nfsd4_session *ses) { @@ -210,6 +211,85 @@ static void nfsd4_put_session(struct nfsd4_session *ses) spin_unlock(&nn->client_lock); } +static struct nfsd4_blocked_lock * +find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh, + struct nfsd_net *nn) +{ + struct nfsd4_blocked_lock *cur, *found = NULL; + + spin_lock(&nn->client_lock); + list_for_each_entry(cur, &lo->lo_blocked, nbl_list) { + if (fh_match(fh, &cur->nbl_fh)) { + list_del_init(&cur->nbl_list); + list_del_init(&cur->nbl_lru); + found = cur; + break; + } + } + spin_unlock(&nn->client_lock); + if (found) + posix_unblock_lock(&found->nbl_lock); + return found; +} + +static struct nfsd4_blocked_lock * +find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh, + struct nfsd_net *nn) +{ + struct nfsd4_blocked_lock *nbl; + + nbl = find_blocked_lock(lo, fh, nn); + if (!nbl) { + nbl = kmalloc(sizeof(*nbl), GFP_KERNEL); + if (nbl) { + fh_copy_shallow(&nbl->nbl_fh, fh); + locks_init_lock(&nbl->nbl_lock); + nfsd4_init_cb(&nbl->nbl_cb, lo->lo_owner.so_client, + &nfsd4_cb_notify_lock_ops, + NFSPROC4_CLNT_CB_NOTIFY_LOCK); + } + } + return nbl; +} + +static void +free_blocked_lock(struct nfsd4_blocked_lock *nbl) +{ + locks_release_private(&nbl->nbl_lock); + kfree(nbl); +} + +static int +nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task) +{ + /* + * Since this is just an optimization, we don't try very hard if it + * turns out not to succeed. We'll requeue it on NFS4ERR_DELAY, and + * just quit trying on anything else. 
+ */ + switch (task->tk_status) { + case -NFS4ERR_DELAY: + rpc_delay(task, 1 * HZ); + return 0; + default: + return 1; + } +} + +static void +nfsd4_cb_notify_lock_release(struct nfsd4_callback *cb) +{ + struct nfsd4_blocked_lock *nbl = container_of(cb, + struct nfsd4_blocked_lock, nbl_cb); + + free_blocked_lock(nbl); +} + +static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = { + .done = nfsd4_cb_notify_lock_done, + .release = nfsd4_cb_notify_lock_release, +}; + static inline struct nfs4_stateowner * nfs4_get_stateowner(struct nfs4_stateowner *sop) { @@ -3224,9 +3304,10 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, goto out; /* cases below refer to rfc 3530 section 14.2.34: */ if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) { - if (conf && !unconf) /* case 2: probable retransmit */ + if (conf && same_verf(&confirm, &conf->cl_confirm)) { + /* case 2: probable retransmit */ status = nfs_ok; - else /* case 4: client hasn't noticed we rebooted yet? */ + } else /* case 4: client hasn't noticed we rebooted yet? */ status = nfserr_stale_clientid; goto out; } @@ -4410,9 +4491,11 @@ out: * To finish the open response, we just need to set the rflags. */ open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; - if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) && - !nfsd4_has_session(&resp->cstate)) + if (nfsd4_has_session(&resp->cstate)) + open->op_rflags |= NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK; + else if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED)) open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; + if (dp) nfs4_put_stid(&dp->dl_stid); if (stp) @@ -4501,6 +4584,7 @@ nfs4_laundromat(struct nfsd_net *nn) struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct nfs4_ol_stateid *stp; + struct nfsd4_blocked_lock *nbl; struct list_head *pos, *next, reaplist; time_t cutoff = get_seconds() - nn->nfsd4_lease; time_t t, new_timeo = nn->nfsd4_lease; @@ -4569,6 +4653,41 @@ nfs4_laundromat(struct nfsd_net *nn) } spin_unlock(&nn->client_lock); + /* + * It's possible for a client to try and acquire an already held lock + * that is being held for a long time, and then lose interest in it. + * So, we clean out any un-revisited request after a lease period + * under the assumption that the client is no longer interested. + * + * RFC5661, sec. 9.6 states that the client must not rely on getting + * notifications and must continue to poll for locks, even when the + * server supports them. Thus this shouldn't lead to clients blocking + * indefinitely once the lock does become free. 
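+ * (nbl_time is stamped in nfsd4_lock() when the request is queued; the loop below reaps entries whose stamp is older than one lease period.)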
+ */ + BUG_ON(!list_empty(&reaplist)); + spin_lock(&nn->client_lock); + while (!list_empty(&nn->blocked_locks_lru)) { + nbl = list_first_entry(&nn->blocked_locks_lru, + struct nfsd4_blocked_lock, nbl_lru); + if (time_after((unsigned long)nbl->nbl_time, + (unsigned long)cutoff)) { + t = nbl->nbl_time - cutoff; + new_timeo = min(new_timeo, t); + break; + } + list_move(&nbl->nbl_lru, &reaplist); + list_del_init(&nbl->nbl_list); + } + spin_unlock(&nn->client_lock); + + while (!list_empty(&reaplist)) { + nbl = list_first_entry(&reaplist, + struct nfsd4_blocked_lock, nbl_lru); + list_del_init(&nbl->nbl_lru); + posix_unblock_lock(&nbl->nbl_lock); + free_blocked_lock(nbl); + } + new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); return new_timeo; } @@ -5309,7 +5428,31 @@ nfsd4_fl_put_owner(fl_owner_t owner) nfs4_put_stateowner(&lo->lo_owner); } +static void +nfsd4_lm_notify(struct file_lock *fl) +{ + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)fl->fl_owner; + struct net *net = lo->lo_owner.so_client->net; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd4_blocked_lock *nbl = container_of(fl, + struct nfsd4_blocked_lock, nbl_lock); + bool queue = false; + + /* An empty list means that something else is going to be using it */ + spin_lock(&nn->client_lock); + if (!list_empty(&nbl->nbl_list)) { + list_del_init(&nbl->nbl_list); + list_del_init(&nbl->nbl_lru); + queue = true; + } + spin_unlock(&nn->client_lock); + + if (queue) + nfsd4_run_cb(&nbl->nbl_cb); +} + static const struct lock_manager_operations nfsd_posix_mng_ops = { + .lm_notify = nfsd4_lm_notify, .lm_get_owner = nfsd4_fl_get_owner, .lm_put_owner = nfsd4_fl_put_owner, }; @@ -5407,6 +5550,7 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp); if (!lo) return NULL; + INIT_LIST_HEAD(&lo->lo_blocked); INIT_LIST_HEAD(&lo->lo_owner.so_stateids); lo->lo_owner.so_is_open_owner = 0; lo->lo_owner.so_seqid = lock->lk_new_lock_seqid; @@ -5588,12 +5732,15 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *open_stp = NULL; struct nfs4_file *fp; struct file *filp = NULL; + struct nfsd4_blocked_lock *nbl = NULL; struct file_lock *file_lock = NULL; struct file_lock *conflock = NULL; __be32 status = 0; int lkflg; int err; bool new = false; + unsigned char fl_type; + unsigned int fl_flags = FL_POSIX; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -5658,46 +5805,55 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!locks_in_grace(net) && lock->lk_reclaim) goto out; - file_lock = locks_alloc_lock(); - if (!file_lock) { - dprintk("NFSD: %s: unable to allocate lock!\n", __func__); - status = nfserr_jukebox; - goto out; - } - fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { - case NFS4_READ_LT: case NFS4_READW_LT: + if (nfsd4_has_session(cstate)) + fl_flags |= FL_SLEEP; + /* Fallthrough */ + case NFS4_READ_LT: spin_lock(&fp->fi_lock); filp = find_readable_file_locked(fp); if (filp) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); spin_unlock(&fp->fi_lock); - file_lock->fl_type = F_RDLCK; + fl_type = F_RDLCK; break; - case NFS4_WRITE_LT: case NFS4_WRITEW_LT: + if (nfsd4_has_session(cstate)) + fl_flags |= FL_SLEEP; + /* Fallthrough */ + case NFS4_WRITE_LT: spin_lock(&fp->fi_lock); filp = find_writeable_file_locked(fp); if (filp) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); 
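/* With a v4.1+ session, READW/WRITEW requests carry FL_SLEEP at this point, so the vfs_lock_file() call further down can return FILE_LOCK_DEFERRED; the nfsd4_blocked_lock then stays queued on the lockowner's lo_blocked list until nfsd4_lm_notify() fires the CB_NOTIFY_LOCK callback or the laundromat expires it. */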
spin_unlock(&fp->fi_lock); - file_lock->fl_type = F_WRLCK; + fl_type = F_WRLCK; break; default: status = nfserr_inval; goto out; } + if (!filp) { status = nfserr_openmode; goto out; } + nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn); + if (!nbl) { + dprintk("NFSD: %s: unable to allocate block!\n", __func__); + status = nfserr_jukebox; + goto out; + } + + file_lock = &nbl->nbl_lock; + file_lock->fl_type = fl_type; file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner)); file_lock->fl_pid = current->tgid; file_lock->fl_file = filp; - file_lock->fl_flags = FL_POSIX; + file_lock->fl_flags = fl_flags; file_lock->fl_lmops = &nfsd_posix_mng_ops; file_lock->fl_start = lock->lk_offset; file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length); @@ -5710,18 +5866,29 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } + if (fl_flags & FL_SLEEP) { + nbl->nbl_time = get_seconds(); + spin_lock(&nn->client_lock); + list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked); + list_add_tail(&nbl->nbl_lru, &nn->blocked_locks_lru); + spin_unlock(&nn->client_lock); + } + err = vfs_lock_file(filp, F_SETLK, file_lock, conflock); - switch (-err) { + switch (err) { case 0: /* success! */ nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid); status = 0; break; - case (EAGAIN): /* conflock holds conflicting lock */ + case FILE_LOCK_DEFERRED: + nbl = NULL; + /* Fallthrough */ + case -EAGAIN: /* conflock holds conflicting lock */ status = nfserr_denied; dprintk("NFSD: nfsd4_lock: conflicting lock found!\n"); nfs4_set_lock_denied(conflock, &lock->lk_denied); break; - case (EDEADLK): + case -EDEADLK: status = nfserr_deadlock; break; default: @@ -5730,6 +5897,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; } out: + if (nbl) { + /* dequeue it if we queued it before */ + if (fl_flags & FL_SLEEP) { + spin_lock(&nn->client_lock); + list_del_init(&nbl->nbl_list); + list_del_init(&nbl->nbl_lru); + spin_unlock(&nn->client_lock); + } + free_blocked_lock(nbl); + } if (filp) fput(filp); if (lock_stp) { @@ -5753,8 +5930,6 @@ out: if (open_stp) nfs4_put_stid(&open_stp->st_stid); nfsd4_bump_seqid(cstate, status); - if (file_lock) - locks_free_lock(file_lock); if (conflock) locks_free_lock(conflock); return status; @@ -6768,6 +6943,7 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->client_lru); INIT_LIST_HEAD(&nn->close_lru); INIT_LIST_HEAD(&nn->del_recall_lru); + INIT_LIST_HEAD(&nn->blocked_locks_lru); spin_lock_init(&nn->client_lock); INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); @@ -6865,6 +7041,7 @@ nfs4_state_shutdown_net(struct net *net) struct nfs4_delegation *dp = NULL; struct list_head *pos, *next, reaplist; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd4_blocked_lock *nbl; cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); @@ -6885,6 +7062,24 @@ nfs4_state_shutdown_net(struct net *net) nfs4_put_stid(&dp->dl_stid); } + BUG_ON(!list_empty(&reaplist)); + spin_lock(&nn->client_lock); + while (!list_empty(&nn->blocked_locks_lru)) { + nbl = list_first_entry(&nn->blocked_locks_lru, + struct nfsd4_blocked_lock, nbl_lru); + list_move(&nbl->nbl_lru, &reaplist); + list_del_init(&nbl->nbl_list); + } + spin_unlock(&nn->client_lock); + + while (!list_empty(&reaplist)) { + nbl = list_first_entry(&reaplist, + struct nfsd4_blocked_lock, nbl_lru); + list_del_init(&nbl->nbl_lru); + 
posix_unblock_lock(&nbl->nbl_lock); + free_blocked_lock(nbl); + } + nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 0aa0236a1429..c2d2895a1ec1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1694,6 +1694,30 @@ nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone) } static __be32 +nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) +{ + DECODE_HEAD; + unsigned int tmp; + + status = nfsd4_decode_stateid(argp, &copy->cp_src_stateid); + if (status) + return status; + status = nfsd4_decode_stateid(argp, &copy->cp_dst_stateid); + if (status) + return status; + + READ_BUF(8 + 8 + 8 + 4 + 4 + 4); + p = xdr_decode_hyper(p, &copy->cp_src_pos); + p = xdr_decode_hyper(p, &copy->cp_dst_pos); + p = xdr_decode_hyper(p, &copy->cp_count); + copy->cp_consecutive = be32_to_cpup(p++); + copy->cp_synchronous = be32_to_cpup(p++); + tmp = be32_to_cpup(p); /* Source server list not supported */ + + DECODE_TAIL; +} + static __be32 nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) { DECODE_HEAD; @@ -1793,7 +1817,7 @@ static nfsd4_dec nfsd4_dec_ops[] = { /* new operations for NFSv4.2 */ [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, - [OP_COPY] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_COPY] = (nfsd4_dec)nfsd4_decode_copy, [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp, @@ -4062,7 +4086,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, u32 starting_len = xdr->buf->len, needed_len; __be32 *p; - dprintk("%s: err %d\n", __func__, nfserr); + dprintk("%s: err %d\n", __func__, be32_to_cpu(nfserr)); if (nfserr) goto out; @@ -4202,6 +4226,41 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, #endif /* CONFIG_NFSD_PNFS */ static __be32 +nfsd42_encode_write_res(struct nfsd4_compoundres *resp, struct nfsd42_write_res *write) +{ + __be32 *p; + + p = xdr_reserve_space(&resp->xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; + + *p++ = cpu_to_be32(0); + p = xdr_encode_hyper(p, write->wr_bytes_written); + *p++ = cpu_to_be32(write->wr_stable_how); + p = xdr_encode_opaque_fixed(p, write->wr_verifier.data, + NFS4_VERIFIER_SIZE); + return nfs_ok; +} + +static __be32 +nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_copy *copy) +{ + __be32 *p; + + if (!nfserr) { + nfserr = nfsd42_encode_write_res(resp, &copy->cp_res); + if (nfserr) + return nfserr; + + p = xdr_reserve_space(&resp->xdr, 4 + 4); + *p++ = cpu_to_be32(copy->cp_consecutive); + *p++ = cpu_to_be32(copy->cp_synchronous); + } + return nfserr; +} + +static __be32 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_seek *seek) { @@ -4300,7 +4359,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { /* NFSv4.2 operations */ [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_COPY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_COPY] = (nfsd4_enc)nfsd4_encode_copy, [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_noop, [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 65ad0165a94f..36b2af931e06 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1216,6 +1216,8 @@ static __net_init int nfsd_init_net(struct net *net) goto out_idmap_error; nn->nfsd4_lease = 90; /* default lease time */ nn->nfsd4_grace = 90; + nn->clverifier_counter = 
prandom_u32(); + nn->clientid_counter = prandom_u32(); return 0; out_idmap_error: diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 08188743db53..010aff5c5a79 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -789,6 +789,7 @@ nfserrno (int errno) { nfserr_toosmall, -ETOOSMALL }, { nfserr_serverfault, -ESERVERFAULT }, { nfserr_serverfault, -ENFILE }, + { nfserr_io, -EUCLEAN }, }; int i; @@ -796,7 +797,7 @@ nfserrno (int errno) if (nfs_errtbl[i].syserr == errno) return nfs_errtbl[i].nfserr; } - WARN(1, "nfsd: non-standard errno: %d\n", errno); + WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno); return nfserr_io; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 45007acaf364..a2b65fc56dd6 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -366,14 +366,21 @@ static struct notifier_block nfsd_inet6addr_notifier = { }; #endif +/* Only used under nfsd_mutex, so this atomic may be overkill: */ +static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0); + static void nfsd_last_thread(struct svc_serv *serv, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); + /* check if the notifier still has clients */ + if (atomic_dec_return(&nfsd_notifier_refcount) == 0) { + unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) - unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); + unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif + } + /* * write_ports can create the server without actually starting * any threads--if we get shut down before any threads are @@ -488,10 +495,13 @@ int nfsd_create_serv(struct net *net) } set_max_drc(); - register_inetaddr_notifier(&nfsd_inetaddr_notifier); + /* check if the notifier is already set */ + if (atomic_inc_return(&nfsd_notifier_refcount) == 1) { + register_inetaddr_notifier(&nfsd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) - register_inet6addr_notifier(&nfsd_inet6addr_notifier); + register_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif + } do_gettimeofday(&nn->nfssvc_boot); /* record boot time */ return 0; } diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index 0c2a716e8741..d27a5aa60022 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -19,6 +19,7 @@ struct nfsd4_deviceid_map { struct nfsd4_layout_ops { u32 notify_types; + bool disable_recalls; __be32 (*proc_getdeviceinfo)(struct super_block *sb, struct svc_rqst *rqstp, diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index b95adf9a1595..c9399366f9df 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -63,7 +63,6 @@ typedef struct { struct nfsd4_callback { struct nfs4_client *cb_clp; - u32 cb_minorversion; struct rpc_message cb_msg; const struct nfsd4_callback_ops *cb_ops; struct work_struct cb_work; @@ -441,11 +440,11 @@ struct nfs4_openowner { /* * Represents a generic "lockowner". Similar to an openowner. References to it * are held by the lock stateids that are created on its behalf. This object is - * a superset of the nfs4_stateowner struct (or would be if it needed any extra - * fields). + * a superset of the nfs4_stateowner struct. 
*/ struct nfs4_lockowner { - struct nfs4_stateowner lo_owner; /* must be first element */ + struct nfs4_stateowner lo_owner; /* must be first element */ + struct list_head lo_blocked; /* blocked file_locks */ }; static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so) @@ -572,6 +571,7 @@ enum nfsd4_cb_op { NFSPROC4_CLNT_CB_RECALL, NFSPROC4_CLNT_CB_LAYOUT, NFSPROC4_CLNT_CB_SEQUENCE, + NFSPROC4_CLNT_CB_NOTIFY_LOCK, }; /* Returns true iff a is later than b: */ @@ -580,6 +580,20 @@ static inline bool nfsd4_stateid_generation_after(stateid_t *a, stateid_t *b) return (s32)(a->si_generation - b->si_generation) > 0; } +/* + * When a client tries to get a lock on a file, we set one of these objects + * on the blocking lock. When the lock becomes free, we can then issue a + * CB_NOTIFY_LOCK to the client. + */ +struct nfsd4_blocked_lock { + struct list_head nbl_list; + struct list_head nbl_lru; + unsigned long nbl_time; + struct file_lock nbl_lock; + struct knfsd_fh nbl_fh; + struct nfsd4_callback nbl_cb; +}; + struct nfsd4_compound_state; struct nfsd_net; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ff476e654b8f..8ca642fe9b21 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -513,6 +513,22 @@ __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, count)); } +ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, + u64 dst_pos, u64 count) +{ + + /* + * Limit copy to 4MB to prevent indefinitely blocking an nfsd + * thread and client rpc slot. The choice of 4MB is somewhat + * arbitrary. We might instead base this on r/wsize, or make it + * tunable, or use a time instead of a byte limit, or implement + * asynchronous copy. In theory a client could also recognize a + * limit like this and pipeline multiple COPY requests.
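
Since the comment above anticipates clients pipelining multiple bounded COPY requests, here is a minimal userspace sketch of that calling pattern against the generic copy_file_range(2) syscall (glibc 2.27+; an illustration only, not nfsd code):

    #define _GNU_SOURCE
    #include <unistd.h>

    /* Copy 'count' bytes in slices no larger than 4 MiB. */
    static int copy_in_slices(int src, int dst, long long count)
    {
            while (count > 0) {
                    size_t chunk = count > (1 << 22) ? (1 << 22) : (size_t)count;
                    ssize_t n = copy_file_range(src, NULL, dst, NULL, chunk, 0);

                    if (n <= 0)
                            return -1;      /* error, or unexpected end of file */
                    count -= n;
            }
            return 0;
    }
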
+ */ + count = min_t(u64, count, 1 << 22); + return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0); +} + __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, loff_t len, int flags) diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 3cbb1b33777b..0bf9e7bf5800 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -96,6 +96,8 @@ __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, struct svc_fh *res); __be32 nfsd_link(struct svc_rqst *, struct svc_fh *, char *, int, struct svc_fh *); +ssize_t nfsd_copy_file_range(struct file *, u64, + struct file *, u64, u64); __be32 nfsd_rename(struct svc_rqst *, struct svc_fh *, char *, int, struct svc_fh *, char *, int); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index beea0c5edc51..8fda4abdf3b1 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -503,6 +503,28 @@ struct nfsd4_clone { u64 cl_count; }; +struct nfsd42_write_res { + u64 wr_bytes_written; + u32 wr_stable_how; + nfs4_verifier wr_verifier; +}; + +struct nfsd4_copy { + /* request */ + stateid_t cp_src_stateid; + stateid_t cp_dst_stateid; + u64 cp_src_pos; + u64 cp_dst_pos; + u64 cp_count; + + /* both */ + bool cp_consecutive; + bool cp_synchronous; + + /* response */ + struct nfsd42_write_res cp_res; +}; + struct nfsd4_seek { /* request */ stateid_t seek_stateid; @@ -568,6 +590,7 @@ struct nfsd4_op { struct nfsd4_fallocate allocate; struct nfsd4_fallocate deallocate; struct nfsd4_clone clone; + struct nfsd4_copy copy; struct nfsd4_seek seek; } u; struct nfs4_replay * replay; diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h index c47f6fdb111a..49b719dfef95 100644 --- a/fs/nfsd/xdr4cb.h +++ b/fs/nfsd/xdr4cb.h @@ -28,3 +28,12 @@ #define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \ cb_sequence_dec_sz + \ op_dec_sz) + +#define NFS4_enc_cb_notify_lock_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 2 + 1 + \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ + enc_nfs4_fh_sz) +#define NFS4_dec_cb_notify_lock_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 6ea06f8a7d29..3f828a187049 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -3188,6 +3188,9 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, migrate->new_master, migrate->master); + if (ret < 0) + kmem_cache_free(dlm_mle_cache, mle); + spin_unlock(&dlm->master_lock); unlock: spin_unlock(&dlm->spinlock); diff --git a/fs/open.c b/fs/open.c index 8aeb08bb278b..d3ed8171e8e0 100644 --- a/fs/open.c +++ b/fs/open.c @@ -267,6 +267,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) (mode & ~FALLOC_FL_INSERT_RANGE)) return -EINVAL; + /* Unshare range should only be used with allocate mode. */ + if ((mode & FALLOC_FL_UNSHARE_RANGE) && + (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE))) + return -EINVAL; + if (!(file->f_mode & FMODE_WRITE)) return -EBADF; @@ -300,7 +305,8 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) * Let individual file system decide if it supports preallocation * for directories or not. 
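
The new vfs_fallocate() check above encodes "UNSHARE_RANGE may only be combined with KEEP_SIZE" as a single mask test. A self-contained rendering of the same test (this assumes a uapi falloc.h that already defines FALLOC_FL_UNSHARE_RANGE, which this series introduces):

    #include <stdbool.h>
    #include <linux/falloc.h>

    static bool unshare_flags_valid(int mode)
    {
            if ((mode & FALLOC_FL_UNSHARE_RANGE) &&
                (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)))
                    return false;
            return true;
    }

    /*
     * unshare_flags_valid(FALLOC_FL_UNSHARE_RANGE)                        -> true
     * unshare_flags_valid(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)  -> true
     * unshare_flags_valid(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_PUNCH_HOLE) -> false
     */
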
*/ - if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && + !S_ISBLK(inode->i_mode)) return -ENODEV; /* Check for wrap through zero too */ diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 3f803b3a1f82..aeb60f791418 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -57,6 +57,7 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new) ssize_t list_size, size, value_size = 0; char *buf, *name, *value = NULL; int uninitialized_var(error); + size_t slen; if (!(old->d_inode->i_opflags & IOP_XATTR) || !(new->d_inode->i_opflags & IOP_XATTR)) @@ -79,7 +80,16 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new) goto out; } - for (name = buf; name < (buf + list_size); name += strlen(name) + 1) { + for (name = buf; list_size; name += slen) { + slen = strnlen(name, list_size) + 1; + + /* underlying fs providing us with a broken xattr list? */ + if (WARN_ON(slen > list_size)) { + error = -EIO; + break; + } + list_size -= slen; + if (ovl_is_private_xattr(name)) continue; retry: @@ -174,40 +184,6 @@ out_fput: return error; } -static char *ovl_read_symlink(struct dentry *realdentry) -{ - int res; - char *buf; - struct inode *inode = realdentry->d_inode; - mm_segment_t old_fs; - - res = -EINVAL; - if (!inode->i_op->readlink) - goto err; - - res = -ENOMEM; - buf = (char *) __get_free_page(GFP_KERNEL); - if (!buf) - goto err; - - old_fs = get_fs(); - set_fs(get_ds()); - /* The cast to a user pointer is valid due to the set_fs() */ - res = inode->i_op->readlink(realdentry, - (char __user *)buf, PAGE_SIZE - 1); - set_fs(old_fs); - if (res < 0) { - free_page((unsigned long) buf); - goto err; - } - buf[res] = '\0'; - - return buf; - -err: - return ERR_PTR(res); -} - static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat) { struct iattr attr = { @@ -354,19 +330,20 @@ out_cleanup: int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, struct path *lowerpath, struct kstat *stat) { + DEFINE_DELAYED_CALL(done); struct dentry *workdir = ovl_workdir(dentry); int err; struct kstat pstat; struct path parentpath; + struct dentry *lowerdentry = lowerpath->dentry; struct dentry *upperdir; struct dentry *upperdentry; - const struct cred *old_cred; - char *link = NULL; + const char *link = NULL; if (WARN_ON(!workdir)) return -EROFS; - ovl_do_check_copy_up(lowerpath->dentry); + ovl_do_check_copy_up(lowerdentry); ovl_path_upper(parent, &parentpath); upperdir = parentpath.dentry; @@ -376,13 +353,11 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, return err; if (S_ISLNK(stat->mode)) { - link = ovl_read_symlink(lowerpath->dentry); + link = vfs_get_link(lowerdentry, &done); if (IS_ERR(link)) return PTR_ERR(link); } - old_cred = ovl_override_creds(dentry->d_sb); - err = -EIO; if (lock_rename(workdir, upperdir) != NULL) { pr_err("overlayfs: failed to lock workdir+upperdir\n"); @@ -403,19 +378,16 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, } out_unlock: unlock_rename(workdir, upperdir); - revert_creds(old_cred); - - if (link) - free_page((unsigned long) link); + do_delayed_call(&done); return err; } int ovl_copy_up(struct dentry *dentry) { - int err; + int err = 0; + const struct cred *old_cred = ovl_override_creds(dentry->d_sb); - err = 0; while (!err) { struct dentry *next; struct dentry *parent; @@ -447,6 +419,7 @@ int ovl_copy_up(struct dentry *dentry) dput(parent); dput(next); } + revert_creds(old_cred); return err; } diff --git
a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 5f90ddf778ba..306b6c161840 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -14,6 +14,7 @@ #include <linux/cred.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> +#include <linux/atomic.h> #include "overlayfs.h" void ovl_cleanup(struct inode *wdir, struct dentry *wdentry) @@ -37,8 +38,10 @@ struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry) { struct dentry *temp; char name[20]; + static atomic_t temp_id = ATOMIC_INIT(0); - snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry); + /* counter is allowed to wrap, since temp dentries are ephemeral */ + snprintf(name, sizeof(name), "#%x", atomic_inc_return(&temp_id)); temp = lookup_one_len(name, workdir, strlen(name)); if (!IS_ERR(temp) && temp->d_inode) { diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index c18d6a4ff456..c58f01babf30 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -19,6 +19,7 @@ static int ovl_copy_up_truncate(struct dentry *dentry) struct dentry *parent; struct kstat stat; struct path lowerpath; + const struct cred *old_cred; parent = dget_parent(dentry); err = ovl_copy_up(parent); @@ -26,12 +27,14 @@ static int ovl_copy_up_truncate(struct dentry *dentry) goto out_dput_parent; ovl_path_lower(dentry, &lowerpath); - err = vfs_getattr(&lowerpath, &stat); - if (err) - goto out_dput_parent; - stat.size = 0; - err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat); + old_cred = ovl_override_creds(dentry->d_sb); + err = vfs_getattr(&lowerpath, &stat); + if (!err) { + stat.size = 0; + err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat); + } + revert_creds(old_cred); out_dput_parent: dput(parent); @@ -153,45 +156,18 @@ static const char *ovl_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct dentry *realdentry; - struct inode *realinode; const struct cred *old_cred; const char *p; if (!dentry) return ERR_PTR(-ECHILD); - realdentry = ovl_dentry_real(dentry); - realinode = realdentry->d_inode; - - if (WARN_ON(!realinode->i_op->get_link)) - return ERR_PTR(-EPERM); - old_cred = ovl_override_creds(dentry->d_sb); - p = realinode->i_op->get_link(realdentry, realinode, done); + p = vfs_get_link(ovl_dentry_real(dentry), done); revert_creds(old_cred); return p; } -static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) -{ - struct path realpath; - struct inode *realinode; - const struct cred *old_cred; - int err; - - ovl_path_real(dentry, &realpath); - realinode = realpath.dentry->d_inode; - - if (!realinode->i_op->readlink) - return -EINVAL; - - old_cred = ovl_override_creds(dentry->d_sb); - err = realinode->i_op->readlink(realpath.dentry, buf, bufsiz); - revert_creds(old_cred); - return err; -} - bool ovl_is_private_xattr(const char *name) { return strncmp(name, OVL_XATTR_PREFIX, @@ -375,7 +351,7 @@ static const struct inode_operations ovl_file_inode_operations = { static const struct inode_operations ovl_symlink_inode_operations = { .setattr = ovl_setattr, .get_link = ovl_get_link, - .readlink = ovl_readlink, + .readlink = generic_readlink, .getattr = ovl_getattr, .listxattr = ovl_listxattr, .update_time = ovl_update_time, diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 0ffc8da1aa3f..bcf3965be819 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -273,12 +273,11 @@ static bool ovl_is_opaquedir(struct dentry *dentry) { int res; char val; - struct inode *inode = dentry->d_inode; - if (!S_ISDIR(inode->i_mode) || 
!(inode->i_opflags & IOP_XATTR)) + if (!d_is_dir(dentry)) return false; - res = __vfs_getxattr(dentry, inode, OVL_XATTR_OPAQUE, &val, 1); + res = vfs_getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1); if (res == 1 && val == 'y') return true; @@ -419,16 +418,12 @@ static bool ovl_dentry_weird(struct dentry *dentry) DCACHE_OP_COMPARE); } -static inline struct dentry *ovl_lookup_real(struct super_block *ovl_sb, - struct dentry *dir, +static inline struct dentry *ovl_lookup_real(struct dentry *dir, const struct qstr *name) { - const struct cred *old_cred; struct dentry *dentry; - old_cred = ovl_override_creds(ovl_sb); dentry = lookup_one_len_unlocked(name->name, dir, name->len); - revert_creds(old_cred); if (IS_ERR(dentry)) { if (PTR_ERR(dentry) == -ENOENT) @@ -469,6 +464,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct ovl_entry *oe; + const struct cred *old_cred; struct ovl_entry *poe = dentry->d_parent->d_fsdata; struct path *stack = NULL; struct dentry *upperdir, *upperdentry = NULL; @@ -479,9 +475,10 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int i; int err; + old_cred = ovl_override_creds(dentry->d_sb); upperdir = ovl_upperdentry_dereference(poe); if (upperdir) { - this = ovl_lookup_real(dentry->d_sb, upperdir, &dentry->d_name); + this = ovl_lookup_real(upperdir, &dentry->d_name); err = PTR_ERR(this); if (IS_ERR(this)) goto out; @@ -514,8 +511,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, bool opaque = false; struct path lowerpath = poe->lowerstack[i]; - this = ovl_lookup_real(dentry->d_sb, - lowerpath.dentry, &dentry->d_name); + this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name); err = PTR_ERR(this); if (IS_ERR(this)) { /* @@ -588,6 +584,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, ovl_copyattr(realdentry->d_inode, inode); } + revert_creds(old_cred); oe->opaque = upperopaque; oe->__upperdentry = upperdentry; memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr); @@ -606,6 +603,7 @@ out_put: out_put_upper: dput(upperdentry); out: + revert_creds(old_cred); return ERR_PTR(err); } @@ -834,6 +832,19 @@ retry: if (err) goto out_dput; + /* + * Try to remove POSIX ACL xattrs from workdir. We are good if: + * + * a) success (there was a POSIX ACL xattr and was removed) + * b) -ENODATA (there was no POSIX ACL xattr) + * c) -EOPNOTSUPP (POSIX ACL xattrs are not supported) + * + * There are various other error values that could effectively + * mean that the xattr doesn't exist (e.g. -ERANGE is returned + * if the xattr name is too long), but the set of filesystems + * allowed as upper are limited to "normal" ones, where checking + * for the above two errors is sufficient. 
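
The "remove and tolerate the two not-there errors" pattern described above is generally useful beyond overlayfs; a userspace analog using removexattr(2) (illustrative, not the kernel code that follows):

    #include <errno.h>
    #include <sys/xattr.h>

    /* Returns 0 if the xattr is gone, was never there, or is unsupported. */
    static int remove_xattr_tolerant(const char *path, const char *name)
    {
            if (removexattr(path, name) == 0)
                    return 0;
            if (errno == ENODATA || errno == EOPNOTSUPP)
                    return 0;
            return -1;
    }
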
+ */ err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT); if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; diff --git a/fs/pipe.c b/fs/pipe.c index 1f559f0608e1..8e0d9f26dfad 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -601,54 +601,63 @@ pipe_fasync(int fd, struct file *filp, int on) return retval; } -static void account_pipe_buffers(struct pipe_inode_info *pipe, +static unsigned long account_pipe_buffers(struct user_struct *user, unsigned long old, unsigned long new) { - atomic_long_add(new - old, &pipe->user->pipe_bufs); + return atomic_long_add_return(new - old, &user->pipe_bufs); } -static bool too_many_pipe_buffers_soft(struct user_struct *user) +static bool too_many_pipe_buffers_soft(unsigned long user_bufs) { - return pipe_user_pages_soft && - atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_soft; + return pipe_user_pages_soft && user_bufs >= pipe_user_pages_soft; } -static bool too_many_pipe_buffers_hard(struct user_struct *user) +static bool too_many_pipe_buffers_hard(unsigned long user_bufs) { - return pipe_user_pages_hard && - atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard; + return pipe_user_pages_hard && user_bufs >= pipe_user_pages_hard; } struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; + unsigned long pipe_bufs = PIPE_DEF_BUFFERS; + struct user_struct *user = get_current_user(); + unsigned long user_bufs; pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT); - if (pipe) { - unsigned long pipe_bufs = PIPE_DEF_BUFFERS; - struct user_struct *user = get_current_user(); - - if (!too_many_pipe_buffers_hard(user)) { - if (too_many_pipe_buffers_soft(user)) - pipe_bufs = 1; - pipe->bufs = kcalloc(pipe_bufs, - sizeof(struct pipe_buffer), - GFP_KERNEL_ACCOUNT); - } + if (pipe == NULL) + goto out_free_uid; - if (pipe->bufs) { - init_waitqueue_head(&pipe->wait); - pipe->r_counter = pipe->w_counter = 1; - pipe->buffers = pipe_bufs; - pipe->user = user; - account_pipe_buffers(pipe, 0, pipe_bufs); - mutex_init(&pipe->mutex); - return pipe; - } - free_uid(user); - kfree(pipe); + if (pipe_bufs * PAGE_SIZE > pipe_max_size && !capable(CAP_SYS_RESOURCE)) + pipe_bufs = pipe_max_size >> PAGE_SHIFT; + + user_bufs = account_pipe_buffers(user, 0, pipe_bufs); + + if (too_many_pipe_buffers_soft(user_bufs)) { + user_bufs = account_pipe_buffers(user, pipe_bufs, 1); + pipe_bufs = 1; + } + + if (too_many_pipe_buffers_hard(user_bufs)) + goto out_revert_acct; + + pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), + GFP_KERNEL_ACCOUNT); + + if (pipe->bufs) { + init_waitqueue_head(&pipe->wait); + pipe->r_counter = pipe->w_counter = 1; + pipe->buffers = pipe_bufs; + pipe->user = user; + mutex_init(&pipe->mutex); + return pipe; } +out_revert_acct: + (void) account_pipe_buffers(user, pipe_bufs, 0); + kfree(pipe); +out_free_uid: + free_uid(user); return NULL; } @@ -656,7 +665,7 @@ void free_pipe_info(struct pipe_inode_info *pipe) { int i; - account_pipe_buffers(pipe, pipe->buffers, 0); + (void) account_pipe_buffers(pipe->user, pipe->buffers, 0); free_uid(pipe->user); for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; @@ -1008,12 +1017,54 @@ const struct file_operations pipefifo_fops = { }; /* + * Currently we rely on the pipe array holding a power-of-2 number + * of pages. 
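
The helper that follows rounds a requested size up to a power-of-2 number of pages. The same arithmetic as standalone C with a worked value (4 KiB pages assumed; my_ prefixes mark stand-ins for the kernel's PAGE_SIZE/PAGE_SHIFT and roundup_pow_of_two()):

    #include <stdio.h>

    #define MY_PAGE_SIZE  4096u
    #define MY_PAGE_SHIFT 12

    static unsigned int my_round_pipe_size(unsigned int size)
    {
            unsigned long nr_pages = (size + MY_PAGE_SIZE - 1) >> MY_PAGE_SHIFT;
            unsigned long pow2 = 1;

            while (pow2 < nr_pages)         /* roundup_pow_of_two() */
                    pow2 <<= 1;
            return pow2 << MY_PAGE_SHIFT;
    }

    int main(void)
    {
            /* 300000 bytes -> 74 pages -> 128 pages -> 524288 bytes */
            printf("%u\n", my_round_pipe_size(300000));
            return 0;
    }
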
+ */ +static inline unsigned int round_pipe_size(unsigned int size) +{ + unsigned long nr_pages; + + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; +} + +/* * Allocate a new array of pipe buffers and copy the info over. Returns the * pipe size if successful, or return -ERROR on error. */ -static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) { struct pipe_buffer *bufs; + unsigned int size, nr_pages; + unsigned long user_bufs; + long ret = 0; + + size = round_pipe_size(arg); + nr_pages = size >> PAGE_SHIFT; + + if (!nr_pages) + return -EINVAL; + + /* + * If trying to increase the pipe capacity, check that an + * unprivileged user is not trying to exceed various limits + * (soft limit check here, hard limit check just below). + * Decreasing the pipe capacity is always permitted, even + * if the user is currently over a limit. + */ + if (nr_pages > pipe->buffers && + size > pipe_max_size && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + user_bufs = account_pipe_buffers(pipe->user, pipe->buffers, nr_pages); + + if (nr_pages > pipe->buffers && + (too_many_pipe_buffers_hard(user_bufs) || + too_many_pipe_buffers_soft(user_bufs)) && + !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_revert_acct; + } /* * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't @@ -1021,13 +1072,17 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) * again like we would do for growing. If the pipe currently * contains more buffers than arg, then return busy. */ - if (nr_pages < pipe->nrbufs) - return -EBUSY; + if (nr_pages < pipe->nrbufs) { + ret = -EBUSY; + goto out_revert_acct; + } bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); - if (unlikely(!bufs)) - return -ENOMEM; + if (unlikely(!bufs)) { + ret = -ENOMEM; + goto out_revert_acct; + } /* * The pipe array wraps around, so just start the new one at zero @@ -1050,24 +1105,15 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer)); } - account_pipe_buffers(pipe, pipe->buffers, nr_pages); pipe->curbuf = 0; kfree(pipe->bufs); pipe->bufs = bufs; pipe->buffers = nr_pages; return nr_pages * PAGE_SIZE; -} - -/* - * Currently we rely on the pipe array holding a power-of-2 number - * of pages. 
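
Note how pipe_set_size() above charges the new capacity first and walks back the charge on every failure path, so the soft/hard limit checks can test one coherent value instead of racing against concurrent resizes. The shape of that idiom, reduced to standalone C11 (hard_limit stands in for pipe_user_pages_hard):

    #include <stdatomic.h>
    #include <stdbool.h>

    static _Atomic long pages_charged;

    static bool try_charge(long pages, long hard_limit)
    {
            long now = atomic_fetch_add(&pages_charged, pages) + pages;

            if (hard_limit && now >= hard_limit) {
                    atomic_fetch_sub(&pages_charged, pages);  /* revert */
                    return false;
            }
            return true;
    }
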
- */ -static inline unsigned int round_pipe_size(unsigned int size) -{ - unsigned long nr_pages; - nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; +out_revert_acct: + (void) account_pipe_buffers(pipe->user, nr_pages, pipe->buffers); + return ret; } /* @@ -1109,28 +1155,9 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) __pipe_lock(pipe); switch (cmd) { - case F_SETPIPE_SZ: { - unsigned int size, nr_pages; - - size = round_pipe_size(arg); - nr_pages = size >> PAGE_SHIFT; - - ret = -EINVAL; - if (!nr_pages) - goto out; - - if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) { - ret = -EPERM; - goto out; - } else if ((too_many_pipe_buffers_hard(pipe->user) || - too_many_pipe_buffers_soft(pipe->user)) && - !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { - ret = -EPERM; - goto out; - } - ret = pipe_set_size(pipe, nr_pages); + case F_SETPIPE_SZ: + ret = pipe_set_size(pipe, arg); break; - } case F_GETPIPE_SZ: ret = pipe->buffers * PAGE_SIZE; break; @@ -1139,7 +1166,6 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) break; } -out: __pipe_unlock(pipe); return ret; } diff --git a/fs/select.c b/fs/select.c index 8ed9da50896a..3d4f85defeab 100644 --- a/fs/select.c +++ b/fs/select.c @@ -29,6 +29,7 @@ #include <linux/sched/rt.h> #include <linux/freezer.h> #include <net/busy_poll.h> +#include <linux/vmalloc.h> #include <asm/uaccess.h> @@ -554,7 +555,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set_bits fds; void *bits; int ret, max_fds; - unsigned int size; + size_t size, alloc_size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; @@ -581,7 +582,14 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, if (size > sizeof(stack_fds) / 6) { /* Not enough space in on-stack array; must use kmalloc */ ret = -ENOMEM; - bits = kmalloc(6 * size, GFP_KERNEL); + if (size > (SIZE_MAX / 6)) + goto out_nofds; + + alloc_size = 6 * size; + bits = kmalloc(alloc_size, GFP_KERNEL|__GFP_NOWARN); + if (!bits && alloc_size > PAGE_SIZE) + bits = vmalloc(alloc_size); + if (!bits) goto out_nofds; } @@ -618,7 +626,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, out: if (bits != stack_fds) - kfree(bits); + kvfree(bits); out_nofds: return ret; } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 94374e435025..2b67bda2021b 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -21,14 +21,14 @@ DEFINE_SPINLOCK(sysfs_symlink_target_lock); void sysfs_warn_dup(struct kernfs_node *parent, const char *name) { - char *buf, *path = NULL; + char *buf; buf = kzalloc(PATH_MAX, GFP_KERNEL); if (buf) - path = kernfs_path(parent, buf, PATH_MAX); + kernfs_path(parent, buf, PATH_MAX); WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s/%s'\n", - path, name); + buf, name); kfree(buf); } diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 584e87e11cb6..26ef1958b65b 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -55,6 +55,8 @@ xfs-y += $(addprefix libxfs/, \ xfs_ag_resv.o \ xfs_rmap.o \ xfs_rmap_btree.o \ + xfs_refcount.o \ + xfs_refcount_btree.o \ xfs_sb.o \ xfs_symlink_remote.o \ xfs_trans_resv.o \ @@ -88,6 +90,7 @@ xfs-y += xfs_aops.o \ xfs_message.o \ xfs_mount.o \ xfs_mru_cache.o \ + xfs_reflink.o \ xfs_stats.o \ xfs_super.o \ xfs_symlink.o \ @@ -100,16 +103,20 @@ xfs-y += xfs_aops.o \ # low-level transaction/log code xfs-y += xfs_log.o \ 
xfs_log_cil.o \ + xfs_bmap_item.o \ xfs_buf_item.o \ xfs_extfree_item.o \ xfs_icreate_item.o \ xfs_inode_item.o \ + xfs_refcount_item.o \ xfs_rmap_item.o \ xfs_log_recover.o \ xfs_trans_ail.o \ + xfs_trans_bmap.o \ xfs_trans_buf.o \ xfs_trans_extfree.o \ xfs_trans_inode.o \ + xfs_trans_refcount.o \ xfs_trans_rmap.o \ # optional features diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index e3ae0f2b4294..e5ebc3770460 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -38,6 +38,7 @@ #include "xfs_trans_space.h" #include "xfs_rmap_btree.h" #include "xfs_btree.h" +#include "xfs_refcount_btree.h" /* * Per-AG Block Reservations @@ -108,7 +109,9 @@ xfs_ag_resv_critical( trace_xfs_ag_resv_critical(pag, type, avail); /* Critically low if less than 10% or max btree height remains. */ - return avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS; + return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS, + pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL, + XFS_RANDOM_AG_RESV_CRITICAL); } /* @@ -228,6 +231,11 @@ xfs_ag_resv_init( if (pag->pag_meta_resv.ar_asked == 0) { ask = used = 0; + error = xfs_refcountbt_calc_reserves(pag->pag_mount, + pag->pag_agno, &ask, &used); + if (error) + goto out; + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, ask, used); if (error) @@ -238,6 +246,11 @@ xfs_ag_resv_init( if (pag->pag_agfl_resv.ar_asked == 0) { ask = used = 0; + error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno, + &ask, &used); + if (error) + goto out; + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used); if (error) goto out; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index ca75dc90ebe0..effb64cf714f 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -52,10 +52,23 @@ STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); +unsigned int +xfs_refc_block( + struct xfs_mount *mp) +{ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + return XFS_RMAP_BLOCK(mp) + 1; + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + return XFS_FIBT_BLOCK(mp) + 1; + return XFS_IBT_BLOCK(mp) + 1; +} + xfs_extlen_t xfs_prealloc_blocks( struct xfs_mount *mp) { + if (xfs_sb_version_hasreflink(&mp->m_sb)) + return xfs_refc_block(mp) + 1; if (xfs_sb_version_hasrmapbt(&mp->m_sb)) return XFS_RMAP_BLOCK(mp) + 1; if (xfs_sb_version_hasfinobt(&mp->m_sb)) @@ -115,6 +128,8 @@ xfs_alloc_ag_max_usable( blocks++; /* finobt root block */ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) blocks++; /* rmap root block */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) + blocks++; /* refcount root block */ return mp->m_sb.sb_agblocks - blocks; } @@ -2321,6 +2336,9 @@ xfs_alloc_log_agf( offsetof(xfs_agf_t, agf_btreeblks), offsetof(xfs_agf_t, agf_uuid), offsetof(xfs_agf_t, agf_rmap_blocks), + offsetof(xfs_agf_t, agf_refcount_blocks), + offsetof(xfs_agf_t, agf_refcount_root), + offsetof(xfs_agf_t, agf_refcount_level), /* needed so that we don't log the whole rest of the structure: */ offsetof(xfs_agf_t, agf_spare64), sizeof(xfs_agf_t) @@ -2458,6 +2476,10 @@ xfs_agf_verify( be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) return false; + if (xfs_sb_version_hasreflink(&mp->m_sb) && + be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS) + return false; + return true; } @@ -2578,6 +2600,7 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); pag->pagf_levels[XFS_BTNUM_RMAPi] =
be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); + pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); spin_lock_init(&pag->pagb_lock); pag->pagb_count = 0; pag->pagb_tree = RB_ROOT; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 9d7f61d36645..c27344cf38e1 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -48,6 +48,7 @@ #include "xfs_filestream.h" #include "xfs_rmap.h" #include "xfs_ag_resv.h" +#include "xfs_refcount.h" kmem_zone_t *xfs_bmap_free_item_zone; @@ -140,7 +141,8 @@ xfs_bmbt_lookup_ge( */ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) { - return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + return whichfork != XFS_COW_FORK && + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && XFS_IFORK_NEXTENTS(ip, whichfork) > XFS_IFORK_MAXEXT(ip, whichfork); } @@ -150,7 +152,8 @@ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) */ static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) { - return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + return whichfork != XFS_COW_FORK && + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && XFS_IFORK_NEXTENTS(ip, whichfork) <= XFS_IFORK_MAXEXT(ip, whichfork); } @@ -640,6 +643,7 @@ xfs_bmap_btree_to_extents( mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(whichfork != XFS_COW_FORK); ASSERT(ifp->if_flags & XFS_IFEXTENTS); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); rblock = ifp->if_broot; @@ -706,6 +710,7 @@ xfs_bmap_extents_to_btree( xfs_bmbt_ptr_t *pp; /* root block address pointer */ mp = ip->i_mount; + ASSERT(whichfork != XFS_COW_FORK); ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); @@ -748,6 +753,7 @@ xfs_bmap_extents_to_btree( args.type = XFS_ALLOCTYPE_START_BNO; args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); } else if (dfops->dop_low) { +try_another_ag: args.type = XFS_ALLOCTYPE_START_BNO; args.fsbno = *firstblock; } else { @@ -762,6 +768,21 @@ xfs_bmap_extents_to_btree( xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } + + /* + * During a CoW operation, the allocation and bmbt updates occur in + * different transactions. The mapping code tries to put new bmbt + * blocks near extents being mapped, but the only way to guarantee this + * is if the alloc and the mapping happen in a single transaction that + * has a block reservation. That isn't the case here, so if we run out + * of space we'll try again with another AG. + */ + if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && + args.fsbno == NULLFSBLOCK && + args.type == XFS_ALLOCTYPE_NEAR_BNO) { + dfops->dop_low = true; + goto try_another_ag; + } /* * Allocation can't fail, the space was reserved. */ @@ -837,6 +858,7 @@ xfs_bmap_local_to_extents_empty( { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(whichfork != XFS_COW_FORK); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); ASSERT(ifp->if_bytes == 0); ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); @@ -896,6 +918,7 @@ xfs_bmap_local_to_extents( * file currently fits in an inode. */ if (*firstblock == NULLFSBLOCK) { +try_another_ag: args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); args.type = XFS_ALLOCTYPE_START_BNO; } else { @@ -908,6 +931,19 @@ xfs_bmap_local_to_extents( if (error) goto done; + /* + * During a CoW operation, the allocation and bmbt updates occur in + * different transactions. 
The mapping code tries to put new bmbt + * blocks near extents being mapped, but the only way to guarantee this + * is if the alloc and the mapping happen in a single transaction that + * has a block reservation. That isn't the case here, so if we run out + * of space we'll try again with another AG. + */ + if (xfs_sb_version_hasreflink(&ip->i_mount->m_sb) && + args.fsbno == NULLFSBLOCK && + args.type == XFS_ALLOCTYPE_NEAR_BNO) { + goto try_another_ag; + } /* Can't fail, the space was reserved. */ ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(args.len == 1); @@ -1670,7 +1706,8 @@ xfs_bmap_one_block( */ STATIC int /* error */ xfs_bmap_add_extent_delay_real( - struct xfs_bmalloca *bma) + struct xfs_bmalloca *bma, + int whichfork) { struct xfs_bmbt_irec *new = &bma->got; int diff; /* temp value */ @@ -1688,11 +1725,14 @@ xfs_bmap_add_extent_delay_real( xfs_filblks_t temp=0; /* value for da_new calculations */ xfs_filblks_t temp2=0;/* value for da_new calculations */ int tmp_rval; /* partial logging flags */ - int whichfork = XFS_DATA_FORK; struct xfs_mount *mp; + xfs_extnum_t *nextents; mp = bma->ip->i_mount; ifp = XFS_IFORK_PTR(bma->ip, whichfork); + ASSERT(whichfork != XFS_ATTR_FORK); + nextents = (whichfork == XFS_COW_FORK ? &bma->ip->i_cnextents : + &bma->ip->i_d.di_nextents); ASSERT(bma->idx >= 0); ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); @@ -1706,6 +1746,9 @@ xfs_bmap_add_extent_delay_real( #define RIGHT r[1] #define PREV r[2] + if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; + /* * Set up a bunch of variables to make the tests simpler. */ @@ -1792,7 +1835,7 @@ xfs_bmap_add_extent_delay_real( trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_iext_remove(bma->ip, bma->idx + 1, 2, state); - bma->ip->i_d.di_nextents--; + (*nextents)--; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -1894,7 +1937,7 @@ xfs_bmap_add_extent_delay_real( xfs_bmbt_set_startblock(ep, new->br_startblock); trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - bma->ip->i_d.di_nextents++; + (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -1964,7 +2007,7 @@ xfs_bmap_add_extent_delay_real( temp = PREV.br_blockcount - new->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); xfs_iext_insert(bma->ip, bma->idx, 1, new, state); - bma->ip->i_d.di_nextents++; + (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2048,7 +2091,7 @@ xfs_bmap_add_extent_delay_real( trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state); - bma->ip->i_d.di_nextents++; + (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2117,7 +2160,7 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount = temp2; /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state); - bma->ip->i_d.di_nextents++; + (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2215,7 +2258,8 @@ xfs_bmap_add_extent_delay_real( xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); done: - bma->logflags |= rval; + if (whichfork != XFS_COW_FORK) + bma->logflags |= rval; return error; #undef LEFT #undef RIGHT @@ -2759,6 +2803,7 @@ done: STATIC void xfs_bmap_add_extent_hole_delay( xfs_inode_t *ip, /* incore inode pointer */ + int whichfork, xfs_extnum_t *idx, /* extent number to update/insert */ xfs_bmbt_irec_t *new) /* 
new data to add to file extents */ { @@ -2770,8 +2815,10 @@ xfs_bmap_add_extent_hole_delay( int state; /* state bits, accessed thru macros */ xfs_filblks_t temp=0; /* temp for indirect calculations */ - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + ifp = XFS_IFORK_PTR(ip, whichfork); state = 0; + if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; ASSERT(isnullstartblock(new->br_startblock)); /* @@ -2789,7 +2836,7 @@ xfs_bmap_add_extent_hole_delay( * Check and set flags if the current (right) segment exists. * If it doesn't exist, we're converting the hole at end-of-file. */ - if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); @@ -2923,6 +2970,7 @@ xfs_bmap_add_extent_hole_real( ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + ASSERT(whichfork != XFS_COW_FORK); XFS_STATS_INC(mp, xs_add_exlist); @@ -3648,7 +3696,9 @@ xfs_bmap_btalloc( else if (mp->m_dalign) stripe_align = mp->m_dalign; - if (xfs_alloc_is_userdata(ap->datatype)) + if (ap->flags & XFS_BMAPI_COWFORK) + align = xfs_get_cowextsz_hint(ap->ip); + else if (xfs_alloc_is_userdata(ap->datatype)) align = xfs_get_extsz_hint(ap->ip); if (unlikely(align)) { error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, @@ -3856,7 +3906,8 @@ xfs_bmap_btalloc( ASSERT(nullfb || fb_agno == args.agno || (ap->dfops->dop_low && fb_agno < args.agno)); ap->length = args.len; - ap->ip->i_d.di_nblocks += args.len; + if (!(ap->flags & XFS_BMAPI_COWFORK)) + ap->ip->i_d.di_nblocks += args.len; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) ap->ip->i_delayed_blks -= args.len; @@ -3876,6 +3927,63 @@ xfs_bmap_btalloc( } /* + * For a remap operation, just "allocate" an extent at the address that the + * caller passed in, and ensure that the AGFL is the right size. The caller + * will then map the "allocated" extent into the file somewhere. + */ +STATIC int +xfs_bmap_remap_alloc( + struct xfs_bmalloca *ap) +{ + struct xfs_trans *tp = ap->tp; + struct xfs_mount *mp = tp->t_mountp; + xfs_agblock_t bno; + struct xfs_alloc_arg args; + int error; + + /* + * validate that the block number is legal - this enables us to detect + * and handle a silent filesystem corruption rather than crashing. + */ + memset(&args, 0, sizeof(struct xfs_alloc_arg)); + args.tp = ap->tp; + args.mp = ap->tp->t_mountp; + bno = *ap->firstblock; + args.agno = XFS_FSB_TO_AGNO(mp, bno); + args.agbno = XFS_FSB_TO_AGBNO(mp, bno); + if (args.agno >= mp->m_sb.sb_agcount || + args.agbno >= mp->m_sb.sb_agblocks) + return -EFSCORRUPTED; + + /* "Allocate" the extent from the range we passed in. */ + trace_xfs_bmap_remap_alloc(ap->ip, *ap->firstblock, ap->length); + ap->blkno = bno; + ap->ip->i_d.di_nblocks += ap->length; + xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); + + /* Fix the freelist, like a real allocator does. */ + args.datatype = ap->datatype; + args.pag = xfs_perag_get(args.mp, args.agno); + ASSERT(args.pag); + + /* + * The freelist fixing code will decline the allocation if + * the size and shape of the free space doesn't allow for + * allocating the extent and updating all the metadata that + * happens during an allocation. We're remapping, not + * allocating, so skip that check by pretending to be freeing.
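
The agno/agbno validation above relies on XFS packing a filesystem block number as (AG number, block offset within the AG). A self-contained sketch of that split; agblklog, the log2 of the AG size in blocks, is a stand-in for the superblock's sb_agblklog:

    #include <stdbool.h>
    #include <stdint.h>

    static bool fsbno_valid(uint64_t fsbno, uint32_t agcount,
                            uint32_t agblocks, unsigned int agblklog)
    {
            uint32_t agno  = fsbno >> agblklog;                 /* XFS_FSB_TO_AGNO()  */
            uint32_t agbno = fsbno & ((1ULL << agblklog) - 1);  /* XFS_FSB_TO_AGBNO() */

            return agno < agcount && agbno < agblocks;
    }
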
+ */ + error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); + if (error) + goto error0; +error0: + xfs_perag_put(args.pag); + if (error) + trace_xfs_bmap_remap_alloc_error(ap->ip, error, _RET_IP_); + return error; +} + +/* * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. * It figures out where to ask the underlying allocator to put the new extent. */ @@ -3883,6 +3991,8 @@ STATIC int xfs_bmap_alloc( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { + if (ap->flags & XFS_BMAPI_REMAP) + return xfs_bmap_remap_alloc(ap); if (XFS_IS_REALTIME_INODE(ap->ip) && xfs_alloc_is_userdata(ap->datatype)) return xfs_bmap_rtalloc(ap); @@ -4012,12 +4122,11 @@ xfs_bmapi_read( int error; int eof; int n = 0; - int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; + int whichfork = xfs_bmapi_whichfork(flags); ASSERT(*nmap >= 1); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| - XFS_BMAPI_IGSTATE))); + XFS_BMAPI_IGSTATE|XFS_BMAPI_COWFORK))); ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)); if (unlikely(XFS_TEST_ERROR( @@ -4035,6 +4144,16 @@ xfs_bmapi_read( ifp = XFS_IFORK_PTR(ip, whichfork); + /* No CoW fork? Return a hole. */ + if (whichfork == XFS_COW_FORK && !ifp) { + mval->br_startoff = bno; + mval->br_startblock = HOLESTARTBLOCK; + mval->br_blockcount = len; + mval->br_state = XFS_EXT_NORM; + *nmap = 1; + return 0; + } + if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, whichfork); if (error) @@ -4084,6 +4203,7 @@ xfs_bmapi_read( int xfs_bmapi_reserve_delalloc( struct xfs_inode *ip, + int whichfork, xfs_fileoff_t aoff, xfs_filblks_t len, struct xfs_bmbt_irec *got, @@ -4092,7 +4212,7 @@ xfs_bmapi_reserve_delalloc( int eof) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); xfs_extlen_t alen; xfs_extlen_t indlen; char rt = XFS_IS_REALTIME_INODE(ip); @@ -4104,7 +4224,10 @@ xfs_bmapi_reserve_delalloc( alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); /* Figure out the extent size, adjust alen */ - extsz = xfs_get_extsz_hint(ip); + if (whichfork == XFS_COW_FORK) + extsz = xfs_get_cowextsz_hint(ip); + else + extsz = xfs_get_extsz_hint(ip); if (extsz) { error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof, 1, 0, &aoff, &alen); @@ -4151,7 +4274,7 @@ xfs_bmapi_reserve_delalloc( got->br_startblock = nullstartblock(indlen); got->br_blockcount = alen; got->br_state = XFS_EXT_NORM; - xfs_bmap_add_extent_hole_delay(ip, lastx, got); + xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got); /* * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay @@ -4182,8 +4305,7 @@ xfs_bmapi_allocate( struct xfs_bmalloca *bma) { struct xfs_mount *mp = bma->ip->i_mount; - int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; + int whichfork = xfs_bmapi_whichfork(bma->flags); struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); int tmp_logflags = 0; int error; @@ -4278,7 +4400,7 @@ xfs_bmapi_allocate( bma->got.br_state = XFS_EXT_UNWRITTEN; if (bma->wasdel) - error = xfs_bmap_add_extent_delay_real(bma); + error = xfs_bmap_add_extent_delay_real(bma, whichfork); else error = xfs_bmap_add_extent_hole_real(bma, whichfork); @@ -4308,8 +4430,7 @@ xfs_bmapi_convert_unwritten( xfs_filblks_t len, int flags) { - int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? 
- XFS_ATTR_FORK : XFS_DATA_FORK; + int whichfork = xfs_bmapi_whichfork(flags); struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); int tmp_logflags = 0; int error; @@ -4325,6 +4446,8 @@ xfs_bmapi_convert_unwritten( (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) return 0; + ASSERT(whichfork != XFS_COW_FORK); + /* * Modify (by adding) the state flag, if writing. */ @@ -4431,8 +4554,7 @@ xfs_bmapi_write( orig_mval = mval; orig_nmap = *nmap; #endif - whichfork = (flags & XFS_BMAPI_ATTRFORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; + whichfork = xfs_bmapi_whichfork(flags); ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); @@ -4441,6 +4563,11 @@ xfs_bmapi_write( ASSERT(len > 0); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK); + ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP)); + ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP)); + ASSERT(!(flags & XFS_BMAPI_PREALLOC) || whichfork != XFS_COW_FORK); + ASSERT(!(flags & XFS_BMAPI_CONVERT) || whichfork != XFS_COW_FORK); /* zeroing is currently only for data extents, not metadata */ ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) != @@ -4502,6 +4629,14 @@ xfs_bmapi_write( wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); /* + * Make sure we only reflink into a hole. + */ + if (flags & XFS_BMAPI_REMAP) + ASSERT(inhole); + if (flags & XFS_BMAPI_COWFORK) + ASSERT(!inhole); + + /* * First, deal with the hole before the allocated space * that we found, if any. */ @@ -4531,6 +4666,17 @@ xfs_bmapi_write( goto error0; if (bma.blkno == NULLFSBLOCK) break; + + /* + * If this is a CoW allocation, record the data in + * the refcount btree for orphan recovery. + */ + if (whichfork == XFS_COW_FORK) { + error = xfs_refcount_alloc_cow_extent(mp, dfops, + bma.blkno, bma.length); + if (error) + goto error0; + } } /* Deal with the allocated space we found. */ @@ -4696,7 +4842,8 @@ xfs_bmap_del_extent( xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *del, /* data to remove from extents */ int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ + int whichfork, /* data or attr fork */ + int bflags) /* bmapi flags */ { xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ @@ -4725,6 +4872,8 @@ xfs_bmap_del_extent( if (whichfork == XFS_ATTR_FORK) state |= BMAP_ATTRFORK; + else if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / @@ -4805,6 +4954,7 @@ xfs_bmap_del_extent( /* * Matches the whole extent. Delete the entry. */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_iext_remove(ip, *idx, 1, whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); --*idx; @@ -4988,9 +5138,16 @@ xfs_bmap_del_extent( /* * If we need to, add to list of extents to delete. */ - if (do_fx) - xfs_bmap_add_free(mp, dfops, del->br_startblock, - del->br_blockcount, NULL); + if (do_fx && !(bflags & XFS_BMAPI_REMAP)) { + if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { + error = xfs_refcount_decrease_extent(mp, dfops, del); + if (error) + goto done; + } else + xfs_bmap_add_free(mp, dfops, del->br_startblock, + del->br_blockcount, NULL); + } + /* * Adjust inode # blocks in the file. */ @@ -4999,7 +5156,7 @@ xfs_bmap_del_extent( /* * Adjust quota data.
*/ - if (qfield) + if (qfield && !(bflags & XFS_BMAPI_REMAP)) xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); /* @@ -5014,6 +5171,175 @@ done: return error; } +/* Remove an extent from the CoW fork. Similar to xfs_bmap_del_extent. */ +int +xfs_bunmapi_cow( + struct xfs_inode *ip, + struct xfs_bmbt_irec *del) +{ + xfs_filblks_t da_new; + xfs_filblks_t da_old; + xfs_fsblock_t del_endblock = 0; + xfs_fileoff_t del_endoff; + int delay; + struct xfs_bmbt_rec_host *ep; + int error; + struct xfs_bmbt_irec got; + xfs_fileoff_t got_endoff; + struct xfs_ifork *ifp; + struct xfs_mount *mp; + xfs_filblks_t nblks; + struct xfs_bmbt_irec new; + /* REFERENCED */ + uint qfield; + xfs_filblks_t temp; + xfs_filblks_t temp2; + int state = BMAP_COWFORK; + int eof; + xfs_extnum_t eidx; + + mp = ip->i_mount; + XFS_STATS_INC(mp, xs_del_exlist); + + ep = xfs_bmap_search_extents(ip, del->br_startoff, XFS_COW_FORK, &eof, + &eidx, &got, &new); + + ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); ifp = ifp; + ASSERT((eidx >= 0) && (eidx < ifp->if_bytes / + (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(del->br_blockcount > 0); + ASSERT(got.br_startoff <= del->br_startoff); + del_endoff = del->br_startoff + del->br_blockcount; + got_endoff = got.br_startoff + got.br_blockcount; + ASSERT(got_endoff >= del_endoff); + delay = isnullstartblock(got.br_startblock); + ASSERT(isnullstartblock(del->br_startblock) == delay); + qfield = 0; + error = 0; + /* + * If deleting a real allocation, must free up the disk space. + */ + if (!delay) { + nblks = del->br_blockcount; + qfield = XFS_TRANS_DQ_BCOUNT; + /* + * Set up del_endblock and cur for later. + */ + del_endblock = del->br_startblock + del->br_blockcount; + da_old = da_new = 0; + } else { + da_old = startblockval(got.br_startblock); + da_new = 0; + nblks = 0; + } + qfield = qfield; + nblks = nblks; + + /* + * Set flag value to use in switch statement. + * Left-contig is 2, right-contig is 1. + */ + switch (((got.br_startoff == del->br_startoff) << 1) | + (got_endoff == del_endoff)) { + case 3: + /* + * Matches the whole extent. Delete the entry. + */ + xfs_iext_remove(ip, eidx, 1, BMAP_COWFORK); + --eidx; + break; + + case 2: + /* + * Deleting the first part of the extent. + */ + trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_); + xfs_bmbt_set_startoff(ep, del_endoff); + temp = got.br_blockcount - del->br_blockcount; + xfs_bmbt_set_blockcount(ep, temp); + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); + da_new = temp; + break; + } + xfs_bmbt_set_startblock(ep, del_endblock); + trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); + break; + + case 1: + /* + * Deleting the last part of the extent. + */ + temp = got.br_blockcount - del->br_blockcount; + trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); + da_new = temp; + break; + } + trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); + break; + + case 0: + /* + * Deleting the middle of the extent. 
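
The switch in xfs_bunmapi_cow() above packs two booleans into a two-bit case index: bit 1 says the deletion starts at the extent's start, bit 0 says it ends at the extent's end, so 3 is the whole extent, 2 the front, 1 the back, and 0 the middle. A runnable illustration of that encoding:

    #include <stdio.h>

    int main(void)
    {
            unsigned int got_start = 10, got_end = 50;   /* existing extent */
            unsigned int del_start = 20, del_end = 30;   /* range to delete */

            switch (((got_start == del_start) << 1) | (got_end == del_end)) {
            case 3: puts("whole extent");     break;
            case 2: puts("front of extent");  break;
            case 1: puts("back of extent");   break;
            case 0: puts("middle of extent"); break;     /* prints this */
            }
            return 0;
    }
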
+ */ + temp = del->br_startoff - got.br_startoff; + trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + new.br_startoff = del_endoff; + temp2 = got_endoff - del_endoff; + new.br_blockcount = temp2; + new.br_state = got.br_state; + if (!delay) { + new.br_startblock = del_endblock; + } else { + temp = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + temp2 = xfs_bmap_worst_indlen(ip, temp2); + new.br_startblock = nullstartblock((int)temp2); + da_new = temp + temp2; + while (da_new > da_old) { + if (temp) { + temp--; + da_new--; + xfs_bmbt_set_startblock(ep, + nullstartblock((int)temp)); + } + if (da_new == da_old) + break; + if (temp2) { + temp2--; + da_new--; + new.br_startblock = + nullstartblock((int)temp2); + } + } + } + trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); + xfs_iext_insert(ip, eidx + 1, 1, &new, state); + ++eidx; + break; + } + + /* + * Account for change in delayed indirect blocks. + * Nothing to do for disk quota accounting here. + */ + ASSERT(da_old >= da_new); + if (da_old > da_new) + xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false); + + return error; +} + /* * Unmap (remove) blocks from a file. * If nexts is nonzero then the number of extents to remove is limited to @@ -5021,17 +5347,16 @@ done: * *done is set. */ int /* error */ -xfs_bunmapi( +__xfs_bunmapi( xfs_trans_t *tp, /* transaction pointer */ struct xfs_inode *ip, /* incore inode */ xfs_fileoff_t bno, /* starting offset to unmap */ - xfs_filblks_t len, /* length to unmap in file */ + xfs_filblks_t *rlen, /* i/o: amount remaining */ int flags, /* misc flags */ xfs_extnum_t nexts, /* number of extents max */ xfs_fsblock_t *firstblock, /* first allocated block controls a.g. for allocs */ - struct xfs_defer_ops *dfops, /* i/o: list extents to free */ - int *done) /* set if not done yet */ + struct xfs_defer_ops *dfops) /* i/o: deferred updates */ { xfs_btree_cur_t *cur; /* bmap btree cursor */ xfs_bmbt_irec_t del; /* extent being deleted */ @@ -5053,11 +5378,12 @@ xfs_bunmapi( int wasdel; /* was a delayed alloc extent */ int whichfork; /* data or attribute fork */ xfs_fsblock_t sum; + xfs_filblks_t len = *rlen; /* length to unmap in file */ trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); - whichfork = (flags & XFS_BMAPI_ATTRFORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; + whichfork = xfs_bmapi_whichfork(flags); + ASSERT(whichfork != XFS_COW_FORK); ifp = XFS_IFORK_PTR(ip, whichfork); if (unlikely( XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -5079,7 +5405,7 @@ xfs_bunmapi( return error; nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); if (nextents == 0) { - *done = 1; + *rlen = 0; return 0; } XFS_STATS_INC(mp, xs_blk_unmap); @@ -5324,7 +5650,7 @@ xfs_bunmapi( cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL; error = xfs_bmap_del_extent(ip, tp, &lastx, dfops, cur, &del, - &tmp_logflags, whichfork); + &tmp_logflags, whichfork, flags); logflags |= tmp_logflags; if (error) goto error0; @@ -5350,7 +5676,10 @@ nodelete: extno++; } } - *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; + if (bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0) + *rlen = 0; + else + *rlen = bno - start + 1; /* * Convert to a btree if necessary. @@ -5406,6 +5735,27 @@ error0: return error; } +/* Unmap a range of a file. 
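
__xfs_bunmapi() now reports progress through *rlen instead of a done flag, which lets callers drive it with a loop on the remaining length. A plausible caller shape under that contract (illustrative only; unmap_whole_range is not a function from this patch):

    static int unmap_whole_range(struct xfs_trans *tp, struct xfs_inode *ip,
                                 xfs_fileoff_t bno, xfs_filblks_t len, int flags,
                                 xfs_fsblock_t *firstblock,
                                 struct xfs_defer_ops *dfops)
    {
            int error = 0;

            /* Unmap one extent per call until nothing remains or we fail. */
            while (len > 0 && !error)
                    error = __xfs_bunmapi(tp, ip, bno, &len, flags, 1,
                                          firstblock, dfops);
            return error;
    }
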
*/ +int +xfs_bunmapi( + xfs_trans_t *tp, + struct xfs_inode *ip, + xfs_fileoff_t bno, + xfs_filblks_t len, + int flags, + xfs_extnum_t nexts, + xfs_fsblock_t *firstblock, + struct xfs_defer_ops *dfops, + int *done) +{ + int error; + + error = __xfs_bunmapi(tp, ip, bno, &len, flags, nexts, firstblock, + dfops); + *done = (len == 0); + return error; +} + /* * Determine whether an extent shift can be accomplished by a merge with the * extent that precedes the target hole of the shift. @@ -5985,3 +6335,146 @@ out: xfs_trans_cancel(tp); return error; } + +/* Deferred mapping is only for real extents in the data fork. */ +static bool +xfs_bmap_is_update_needed( + struct xfs_bmbt_irec *bmap) +{ + return bmap->br_startblock != HOLESTARTBLOCK && + bmap->br_startblock != DELAYSTARTBLOCK; +} + +/* Record a bmap intent. */ +static int +__xfs_bmap_add( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + enum xfs_bmap_intent_type type, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *bmap) +{ + int error; + struct xfs_bmap_intent *bi; + + trace_xfs_bmap_defer(mp, + XFS_FSB_TO_AGNO(mp, bmap->br_startblock), + type, + XFS_FSB_TO_AGBNO(mp, bmap->br_startblock), + ip->i_ino, whichfork, + bmap->br_startoff, + bmap->br_blockcount, + bmap->br_state); + + bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_SLEEP | KM_NOFS); + INIT_LIST_HEAD(&bi->bi_list); + bi->bi_type = type; + bi->bi_owner = ip; + bi->bi_whichfork = whichfork; + bi->bi_bmap = *bmap; + + error = xfs_defer_join(dfops, bi->bi_owner); + if (error) { + kmem_free(bi); + return error; + } + + xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list); + return 0; +} + +/* Map an extent into a file. */ +int +xfs_bmap_map_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + struct xfs_inode *ip, + struct xfs_bmbt_irec *PREV) +{ + if (!xfs_bmap_is_update_needed(PREV)) + return 0; + + return __xfs_bmap_add(mp, dfops, XFS_BMAP_MAP, ip, + XFS_DATA_FORK, PREV); +} + +/* Unmap an extent out of a file. */ +int +xfs_bmap_unmap_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + struct xfs_inode *ip, + struct xfs_bmbt_irec *PREV) +{ + if (!xfs_bmap_is_update_needed(PREV)) + return 0; + + return __xfs_bmap_add(mp, dfops, XFS_BMAP_UNMAP, ip, + XFS_DATA_FORK, PREV); +} + +/* + * Process one of the deferred bmap operations. We pass back the + * btree cursor to maintain our lock on the bmapbt between calls. 
+ */ +int +xfs_bmap_finish_one( + struct xfs_trans *tp, + struct xfs_defer_ops *dfops, + struct xfs_inode *ip, + enum xfs_bmap_intent_type type, + int whichfork, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state) +{ + struct xfs_bmbt_irec bmap; + int nimaps = 1; + xfs_fsblock_t firstfsb; + int flags = XFS_BMAPI_REMAP; + int done; + int error = 0; + + bmap.br_startblock = startblock; + bmap.br_startoff = startoff; + bmap.br_blockcount = blockcount; + bmap.br_state = state; + + trace_xfs_bmap_deferred(tp->t_mountp, + XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, + XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), + ip->i_ino, whichfork, startoff, blockcount, state); + + if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK) + return -EFSCORRUPTED; + if (whichfork == XFS_ATTR_FORK) + flags |= XFS_BMAPI_ATTRFORK; + + if (XFS_TEST_ERROR(false, tp->t_mountp, + XFS_ERRTAG_BMAP_FINISH_ONE, + XFS_RANDOM_BMAP_FINISH_ONE)) + return -EIO; + + switch (type) { + case XFS_BMAP_MAP: + firstfsb = bmap.br_startblock; + error = xfs_bmapi_write(tp, ip, bmap.br_startoff, + bmap.br_blockcount, flags, &firstfsb, + bmap.br_blockcount, &bmap, &nimaps, + dfops); + break; + case XFS_BMAP_UNMAP: + error = xfs_bunmapi(tp, ip, bmap.br_startoff, + bmap.br_blockcount, flags, 1, &firstfsb, + dfops, &done); + ASSERT(done); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + } + + return error; +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 8395f6e8cf7d..f97db7132564 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -97,6 +97,19 @@ struct xfs_extent_free_item */ #define XFS_BMAPI_ZERO 0x080 +/* + * Map the inode offset to the block given in ap->firstblock. Primarily + * used for reflink. The range must be in a hole, and this flag cannot be + * turned on with PREALLOC or CONVERT, and cannot be used on the attr fork. + * + * For bunmapi, this flag unmaps the range without adjusting quota, reducing + * refcount, or freeing the blocks. + */ +#define XFS_BMAPI_REMAP 0x100 + +/* Map something in the CoW fork. */ +#define XFS_BMAPI_COWFORK 0x200 + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -105,12 +118,24 @@ struct xfs_extent_free_item { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ { XFS_BMAPI_CONVERT, "CONVERT" }, \ - { XFS_BMAPI_ZERO, "ZERO" } + { XFS_BMAPI_ZERO, "ZERO" }, \ + { XFS_BMAPI_REMAP, "REMAP" }, \ + { XFS_BMAPI_COWFORK, "COWFORK" } static inline int xfs_bmapi_aflag(int w) { - return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0); + return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : + (w == XFS_COW_FORK ? 
XFS_BMAPI_COWFORK : 0)); +} + +static inline int xfs_bmapi_whichfork(int bmapi_flags) +{ + if (bmapi_flags & XFS_BMAPI_COWFORK) + return XFS_COW_FORK; + else if (bmapi_flags & XFS_BMAPI_ATTRFORK) + return XFS_ATTR_FORK; + return XFS_DATA_FORK; } /* @@ -131,13 +156,15 @@ static inline int xfs_bmapi_aflag(int w) #define BMAP_LEFT_VALID (1 << 6) #define BMAP_RIGHT_VALID (1 << 7) #define BMAP_ATTRFORK (1 << 8) +#define BMAP_COWFORK (1 << 9) #define XFS_BMAP_EXT_FLAGS \ { BMAP_LEFT_CONTIG, "LC" }, \ { BMAP_RIGHT_CONTIG, "RC" }, \ { BMAP_LEFT_FILLING, "LF" }, \ { BMAP_RIGHT_FILLING, "RF" }, \ - { BMAP_ATTRFORK, "ATTR" } + { BMAP_ATTRFORK, "ATTR" }, \ + { BMAP_COWFORK, "COW" } /* @@ -186,10 +213,15 @@ int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fsblock_t *firstblock, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap, struct xfs_defer_ops *dfops); +int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t bno, xfs_filblks_t *rlen, int flags, + xfs_extnum_t nexts, xfs_fsblock_t *firstblock, + struct xfs_defer_ops *dfops); int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, int flags, xfs_extnum_t nexts, xfs_fsblock_t *firstblock, struct xfs_defer_ops *dfops, int *done); +int xfs_bunmapi_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *del); int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, xfs_extnum_t num); uint xfs_default_attroffset(struct xfs_inode *ip); @@ -203,8 +235,31 @@ struct xfs_bmbt_rec_host * xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno, int fork, int *eofp, xfs_extnum_t *lastxp, struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp); -int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, xfs_fileoff_t aoff, - xfs_filblks_t len, struct xfs_bmbt_irec *got, - struct xfs_bmbt_irec *prev, xfs_extnum_t *lastx, int eof); +int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, + xfs_fileoff_t aoff, xfs_filblks_t len, + struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *prev, + xfs_extnum_t *lastx, int eof); + +enum xfs_bmap_intent_type { + XFS_BMAP_MAP = 1, + XFS_BMAP_UNMAP, +}; + +struct xfs_bmap_intent { + struct list_head bi_list; + enum xfs_bmap_intent_type bi_type; + struct xfs_inode *bi_owner; + int bi_whichfork; + struct xfs_bmbt_irec bi_bmap; +}; + +int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, enum xfs_bmap_intent_type type, + int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, + xfs_filblks_t blockcount, xfs_exntst_t state); +int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, struct xfs_bmbt_irec *imap); +int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, struct xfs_bmbt_irec *imap); #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index cd85274e810c..8007d2ba9aef 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -453,6 +453,7 @@ xfs_bmbt_alloc_block( if (args.fsbno == NULLFSBLOCK) { args.fsbno = be64_to_cpu(start->l); +try_another_ag: args.type = XFS_ALLOCTYPE_START_BNO; /* * Make sure there is sufficient room left in the AG to @@ -482,6 +483,22 @@ xfs_bmbt_alloc_block( if (error) goto error0; + /* + * During a CoW operation, the allocation and bmbt updates occur in + * different transactions. 
The mapping code tries to put new bmbt + * blocks near extents being mapped, but the only way to guarantee this + * is if the alloc and the mapping happen in a single transaction that + * has a block reservation. That isn't the case here, so if we run out + * of space we'll try again with another AG. + */ + if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && + args.fsbno == NULLFSBLOCK && + args.type == XFS_ALLOCTYPE_NEAR_BNO) { + cur->bc_private.b.dfops->dop_low = true; + args.fsbno = cur->bc_private.b.firstblock; + goto try_another_ag; + } + if (args.fsbno == NULLFSBLOCK && args.minleft) { /* * Could not find an AG with enough free space to satisfy @@ -777,6 +794,7 @@ xfs_bmbt_init_cursor( { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_btree_cur *cur; + ASSERT(whichfork != XFS_COW_FORK); cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index aa1752f918b8..5c8e6f2ce44f 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -45,9 +45,10 @@ kmem_zone_t *xfs_btree_cur_zone; */ static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, - XFS_FIBT_MAGIC }, + XFS_FIBT_MAGIC, 0 }, { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC, - XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC } + XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC, + XFS_REFC_CRC_MAGIC } }; #define xfs_btree_magic(cur) \ xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum] @@ -1216,6 +1217,9 @@ xfs_btree_set_refs( case XFS_BTNUM_RMAP: xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF); break; + case XFS_BTNUM_REFC: + xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF); + break; default: ASSERT(0); } diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 3f8556a5c2ad..c2b01d1c79ee 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -49,6 +49,7 @@ union xfs_btree_key { struct xfs_inobt_key inobt; struct xfs_rmap_key rmap; struct xfs_rmap_key __rmap_bigkey[2]; + struct xfs_refcount_key refc; }; union xfs_btree_rec { @@ -57,6 +58,7 @@ union xfs_btree_rec { struct xfs_alloc_rec alloc; struct xfs_inobt_rec inobt; struct xfs_rmap_rec rmap; + struct xfs_refcount_rec refc; }; /* @@ -72,6 +74,7 @@ union xfs_btree_rec { #define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) #define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi) #define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi) +#define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi) /* * For logging record fields. @@ -105,6 +108,7 @@ do { \ case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \ case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \ case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \ + case XFS_BTNUM_REFC: __XFS_BTREE_STATS_INC(__mp, refcbt, stat); break; \ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) @@ -127,6 +131,8 @@ do { \ __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \ case XFS_BTNUM_RMAP: \ __XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \ + case XFS_BTNUM_REFC: \ + __XFS_BTREE_STATS_ADD(__mp, refcbt, stat, val); break; \ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) @@ -217,6 +223,15 @@ union xfs_btree_irec { struct xfs_bmbt_irec b; struct xfs_inobt_rec_incore i; struct xfs_rmap_irec r; + struct xfs_refcount_irec rc; +}; + +/* Per-AG btree private information. 
*/ +union xfs_btree_cur_private { + struct { + unsigned long nr_ops; /* # record updates */ + int shape_changes; /* # of extent splits */ + } refc; }; /* @@ -243,6 +258,7 @@ typedef struct xfs_btree_cur struct xfs_buf *agbp; /* agf/agi buffer pointer */ struct xfs_defer_ops *dfops; /* deferred updates */ xfs_agnumber_t agno; /* ag number */ + union xfs_btree_cur_private priv; } a; struct { /* needed for BMAP */ struct xfs_inode *ip; /* pointer to our inode */ diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index e96533d178cf..f6e93ef0bffe 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -51,6 +51,8 @@ struct xfs_defer_pending { * find all the space it needs. */ enum xfs_defer_ops_type { + XFS_DEFER_OPS_TYPE_BMAP, + XFS_DEFER_OPS_TYPE_REFCOUNT, XFS_DEFER_OPS_TYPE_RMAP, XFS_DEFER_OPS_TYPE_FREE, XFS_DEFER_OPS_TYPE_MAX, diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 270fb5cf4fa1..f6547fc5e016 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -456,9 +456,11 @@ xfs_sb_has_compat_feature( #define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ +#define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ #define XFS_SB_FEAT_RO_COMPAT_ALL \ (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ - XFS_SB_FEAT_RO_COMPAT_RMAPBT) + XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ + XFS_SB_FEAT_RO_COMPAT_REFLINK) #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( @@ -546,6 +548,12 @@ static inline bool xfs_sb_version_hasrmapbt(struct xfs_sb *sbp) (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT); } +static inline bool xfs_sb_version_hasreflink(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK); +} + /* * end of superblock version macros */ @@ -641,14 +649,17 @@ typedef struct xfs_agf { uuid_t agf_uuid; /* uuid of filesystem */ __be32 agf_rmap_blocks; /* rmapbt blocks used */ - __be32 agf_padding; /* padding */ + __be32 agf_refcount_blocks; /* refcountbt blocks used */ + + __be32 agf_refcount_root; /* refcount tree root block */ + __be32 agf_refcount_level; /* refcount btree levels */ /* * reserve some contiguous space for future logged fields before we add * the unlogged fields. This makes the range logging via flags and * structure offsets much simpler. */ - __be64 agf_spare64[15]; + __be64 agf_spare64[14]; /* unlogged fields, written during buffer writeback. 
*/ __be64 agf_lsn; /* last write sequence */ @@ -674,8 +685,11 @@ typedef struct xfs_agf { #define XFS_AGF_BTREEBLKS 0x00000800 #define XFS_AGF_UUID 0x00001000 #define XFS_AGF_RMAP_BLOCKS 0x00002000 -#define XFS_AGF_SPARE64 0x00004000 -#define XFS_AGF_NUM_BITS 15 +#define XFS_AGF_REFCOUNT_BLOCKS 0x00004000 +#define XFS_AGF_REFCOUNT_ROOT 0x00008000 +#define XFS_AGF_REFCOUNT_LEVEL 0x00010000 +#define XFS_AGF_SPARE64 0x00020000 +#define XFS_AGF_NUM_BITS 18 #define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) #define XFS_AGF_FLAGS \ @@ -693,6 +707,9 @@ typedef struct xfs_agf { { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \ { XFS_AGF_UUID, "UUID" }, \ { XFS_AGF_RMAP_BLOCKS, "RMAP_BLOCKS" }, \ + { XFS_AGF_REFCOUNT_BLOCKS, "REFCOUNT_BLOCKS" }, \ + { XFS_AGF_REFCOUNT_ROOT, "REFCOUNT_ROOT" }, \ + { XFS_AGF_REFCOUNT_LEVEL, "REFCOUNT_LEVEL" }, \ { XFS_AGF_SPARE64, "SPARE64" } /* disk block (xfs_daddr_t) in the AG */ @@ -885,7 +902,8 @@ typedef struct xfs_dinode { __be64 di_changecount; /* number of attribute changes */ __be64 di_lsn; /* flush sequence */ __be64 di_flags2; /* more random flags */ - __u8 di_pad2[16]; /* more padding for future expansion */ + __be32 di_cowextsize; /* basic cow extent size for file */ + __u8 di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ xfs_timestamp_t di_crtime; /* time created */ @@ -1041,9 +1059,14 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) * 16 bits of the XFS_XFLAG_s range. */ #define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */ +#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */ +#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) +#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) +#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) -#define XFS_DIFLAG2_ANY (XFS_DIFLAG2_DAX) +#define XFS_DIFLAG2_ANY \ + (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE) /* * Inode number format: @@ -1353,7 +1376,9 @@ struct xfs_owner_info { #define XFS_RMAP_OWN_AG (-5ULL) /* AG freespace btree blocks */ #define XFS_RMAP_OWN_INOBT (-6ULL) /* Inode btree blocks */ #define XFS_RMAP_OWN_INODES (-7ULL) /* Inode chunk */ -#define XFS_RMAP_OWN_MIN (-8ULL) /* guard */ +#define XFS_RMAP_OWN_REFC (-8ULL) /* refcount tree */ +#define XFS_RMAP_OWN_COW (-9ULL) /* cow allocations */ +#define XFS_RMAP_OWN_MIN (-10ULL) /* guard */ #define XFS_RMAP_NON_INODE_OWNER(owner) (!!((owner) & (1ULL << 63))) @@ -1434,6 +1459,62 @@ typedef __be32 xfs_rmap_ptr_t; XFS_IBT_BLOCK(mp) + 1) /* + * Reference Count Btree format definitions + * + */ +#define XFS_REFC_CRC_MAGIC 0x52334643 /* 'R3FC' */ + +unsigned int xfs_refc_block(struct xfs_mount *mp); + +/* + * Data record/key structure + * + * Each record associates a range of physical blocks (starting at + * rc_startblock and ending rc_blockcount blocks later) with a reference + * count (rc_refcount). Extents that are being used to stage a copy on + * write (CoW) operation are recorded in the refcount btree with a + * refcount of 1. All other records must have a refcount > 1 and must + * track an extent mapped only by file data forks. + * + * Extents with a single owner (attributes, metadata, non-shared file + * data) are not tracked here. Free space is also not tracked here. + * This is consistent with pre-reflink XFS. 
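+ *
+ * For example (illustrative numbers): if the data forks of three files
+ * map AG blocks 100-119, the tree holds a single record with
+ * rc_startblock == 100, rc_blockcount == 20 and rc_refcount == 3.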
+ */
+
+/*
+ * Extents that are being used to stage a copy on write are stored
+ * in the refcount btree with a refcount of 1 and the upper bit set
+ * on the startblock.  This speeds up mount time deletion of stale
+ * staging extents because they're all at the right side of the tree.
+ */
+#define XFS_REFC_COW_START	((xfs_agblock_t)(1U << 31))
+#define REFCNTBT_COWFLAG_BITLEN	1
+#define REFCNTBT_AGBLOCK_BITLEN	31
+
+struct xfs_refcount_rec {
+	__be32		rc_startblock;	/* starting block number */
+	__be32		rc_blockcount;	/* count of blocks */
+	__be32		rc_refcount;	/* number of inodes linked here */
+};
+
+struct xfs_refcount_key {
+	__be32		rc_startblock;	/* starting block number */
+};
+
+struct xfs_refcount_irec {
+	xfs_agblock_t	rc_startblock;	/* starting block number */
+	xfs_extlen_t	rc_blockcount;	/* count of blocks */
+	xfs_nlink_t	rc_refcount;	/* number of inodes linked here */
+};
+
+#define MAXREFCOUNT	((xfs_nlink_t)~0U)
+#define MAXREFCEXTLEN	((xfs_extlen_t)~0U)
+
+/* btree pointer type */
+typedef __be32 xfs_refcount_ptr_t;
+
+
+/*
 * BMAP Btree format definitions
 *
 * This includes both the root block definition that sits inside an inode fork
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 79455058b752..b72dc821d78b 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -81,14 +81,16 @@ struct getbmapx {
 #define BMV_IF_PREALLOC		0x4	/* rtn status BMV_OF_PREALLOC if req */
 #define BMV_IF_DELALLOC		0x8	/* rtn status BMV_OF_DELALLOC if req */
 #define BMV_IF_NO_HOLES		0x10	/* Do not return holes */
+#define BMV_IF_COWFORK		0x20	/* return CoW fork rather than data */
 #define BMV_IF_VALID	\
 	(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|	\
-	 BMV_IF_DELALLOC|BMV_IF_NO_HOLES)
+	 BMV_IF_DELALLOC|BMV_IF_NO_HOLES|BMV_IF_COWFORK)
 /* bmv_oflags values - returned for each non-header segment */
 #define BMV_OF_PREALLOC		0x1	/* segment = unwritten pre-allocation */
 #define BMV_OF_DELALLOC		0x2	/* segment = delayed allocation */
 #define BMV_OF_LAST		0x4	/* segment is the last in the file */
+#define BMV_OF_SHARED		0x8	/* segment shared with another file */
 /*
  * Structure for XFS_IOC_FSSETDM.
@@ -206,7 +208,8 @@ typedef struct xfs_fsop_resblks {
 #define XFS_FSOP_GEOM_FLAGS_FTYPE	0x10000	/* inode directory types */
 #define XFS_FSOP_GEOM_FLAGS_FINOBT	0x20000	/* free inode btree */
 #define XFS_FSOP_GEOM_FLAGS_SPINODES	0x40000	/* sparse inode chunks */
-#define XFS_FSOP_GEOM_FLAGS_RMAPBT	0x80000	/* Reverse mapping btree */
+#define XFS_FSOP_GEOM_FLAGS_RMAPBT	0x80000	/* reverse mapping btree */
+#define XFS_FSOP_GEOM_FLAGS_REFLINK	0x100000 /* files can share blocks */
 /*
  * Minimum and maximum sizes need for growth checks.
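The two new getbmapx bits are visible to userspace through the XFS_IOC_GETBMAPX ioctl. A minimal sketch of testing them (illustrative only: the fd, the xfs/xfs.h header name, and the minimal error handling are assumptions of this example, not part of the patch):

#include <sys/ioctl.h>
#include <xfs/xfs.h>	/* assumed to declare struct getbmapx and XFS_IOC_GETBMAPX */

/* Return 1 if the first mapped data-fork extent of fd is shared. */
static int first_extent_shared(int fd)
{
	struct getbmapx bmx[2] = {
		/* entry 0 is the query header: whole file, room for 1 result */
		{ .bmv_offset = 0, .bmv_length = -1LL, .bmv_count = 2 },
	};

	if (ioctl(fd, XFS_IOC_GETBMAPX, bmx) < 0 || bmx[0].bmv_entries < 1)
		return 0;
	return (bmx[1].bmv_oflags & BMV_OF_SHARED) != 0;
}

Entry 0 of the array is the request header; results come back starting at entry 1, and bmv_entries reports how many result entries were filled in.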
@@ -275,7 +278,8 @@ typedef struct xfs_bstat { #define bs_projid bs_projid_lo /* (previously just bs_projid) */ __u16 bs_forkoff; /* inode fork offset in bytes */ __u16 bs_projid_hi; /* higher part of project id */ - unsigned char bs_pad[10]; /* pad space, unused */ + unsigned char bs_pad[6]; /* pad space, unused */ + __u32 bs_cowextsize; /* cow extent size */ __u32 bs_dmevmask; /* DMIG event mask */ __u16 bs_dmstate; /* DMIG state info */ __u16 bs_aextents; /* attribute number of extents */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 4b9769e23c83..8de9a3a29589 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -256,6 +256,7 @@ xfs_inode_from_disk( to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); to->di_flags2 = be64_to_cpu(from->di_flags2); + to->di_cowextsize = be32_to_cpu(from->di_cowextsize); } } @@ -305,7 +306,7 @@ xfs_inode_to_disk( to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); to->di_flags2 = cpu_to_be64(from->di_flags2); - + to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(ip->i_ino); to->di_lsn = cpu_to_be64(lsn); memset(to->di_pad2, 0, sizeof(to->di_pad2)); @@ -357,6 +358,7 @@ xfs_log_dinode_to_disk( to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); to->di_flags2 = cpu_to_be64(from->di_flags2); + to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(from->di_ino); to->di_lsn = cpu_to_be64(from->di_lsn); memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); @@ -373,6 +375,9 @@ xfs_dinode_verify( struct xfs_inode *ip, struct xfs_dinode *dip) { + uint16_t flags; + uint64_t flags2; + if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) return false; @@ -389,6 +394,23 @@ xfs_dinode_verify( return false; if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid)) return false; + + flags = be16_to_cpu(dip->di_flags); + flags2 = be64_to_cpu(dip->di_flags2); + + /* don't allow reflink/cowextsize if we don't have reflink */ + if ((flags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)) && + !xfs_sb_version_hasreflink(&mp->m_sb)) + return false; + + /* don't let reflink and realtime mix */ + if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME)) + return false; + + /* don't let reflink and dax mix */ + if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags2 & XFS_DIFLAG2_DAX)) + return false; + return true; } diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 7c4dd321b215..62d9d4681c8c 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -47,6 +47,7 @@ struct xfs_icdinode { __uint16_t di_flags; /* random flags, XFS_DIFLAG_... 
*/ __uint64_t di_flags2; /* more random flags */ + __uint32_t di_cowextsize; /* basic cow extent size for file */ xfs_ictimestamp_t di_crtime; /* time created */ }; diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index bbcc8c7a44b3..5dd56d3dbb3a 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -121,6 +121,26 @@ xfs_iformat_fork( return -EFSCORRUPTED; } + if (unlikely(xfs_is_reflink_inode(ip) && + (VFS_I(ip)->i_mode & S_IFMT) != S_IFREG)) { + xfs_warn(ip->i_mount, + "corrupt dinode %llu, wrong file type for reflink.", + ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(reflink)", + XFS_ERRLEVEL_LOW, ip->i_mount, dip); + return -EFSCORRUPTED; + } + + if (unlikely(xfs_is_reflink_inode(ip) && + (ip->i_d.di_flags & XFS_DIFLAG_REALTIME))) { + xfs_warn(ip->i_mount, + "corrupt dinode %llu, has reflink+realtime flag set.", + ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(reflink)", + XFS_ERRLEVEL_LOW, ip->i_mount, dip); + return -EFSCORRUPTED; + } + switch (VFS_I(ip)->i_mode & S_IFMT) { case S_IFIFO: case S_IFCHR: @@ -186,9 +206,14 @@ xfs_iformat_fork( XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); return -EFSCORRUPTED; } - if (error) { + if (error) return error; + + if (xfs_is_reflink_inode(ip)) { + ASSERT(ip->i_cowfp == NULL); + xfs_ifork_init_cow(ip); } + if (!XFS_DFORK_Q(dip)) return 0; @@ -208,7 +233,8 @@ xfs_iformat_fork( XFS_CORRUPTION_ERROR("xfs_iformat(8)", XFS_ERRLEVEL_LOW, ip->i_mount, dip); - return -EFSCORRUPTED; + error = -EFSCORRUPTED; + break; } error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); @@ -226,6 +252,9 @@ xfs_iformat_fork( if (error) { kmem_zone_free(xfs_ifork_zone, ip->i_afp); ip->i_afp = NULL; + if (ip->i_cowfp) + kmem_zone_free(xfs_ifork_zone, ip->i_cowfp); + ip->i_cowfp = NULL; xfs_idestroy_fork(ip, XFS_DATA_FORK); } return error; @@ -740,6 +769,9 @@ xfs_idestroy_fork( if (whichfork == XFS_ATTR_FORK) { kmem_zone_free(xfs_ifork_zone, ip->i_afp); ip->i_afp = NULL; + } else if (whichfork == XFS_COW_FORK) { + kmem_zone_free(xfs_ifork_zone, ip->i_cowfp); + ip->i_cowfp = NULL; } } @@ -927,6 +959,19 @@ xfs_iext_get_ext( } } +/* Convert bmap state flags to an inode fork. */ +struct xfs_ifork * +xfs_iext_state_to_fork( + struct xfs_inode *ip, + int state) +{ + if (state & BMAP_COWFORK) + return ip->i_cowfp; + else if (state & BMAP_ATTRFORK) + return ip->i_afp; + return &ip->i_df; +} + /* * Insert new item(s) into the extent records for incore inode * fork 'ifp'. 'count' new items are inserted at index 'idx'. @@ -939,7 +984,7 @@ xfs_iext_insert( xfs_bmbt_irec_t *new, /* items to insert */ int state) /* type of extent conversion */ { - xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; + xfs_ifork_t *ifp = xfs_iext_state_to_fork(ip, state); xfs_extnum_t i; /* extent record index */ trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_); @@ -1189,7 +1234,7 @@ xfs_iext_remove( int ext_diff, /* number of extents to remove */ int state) /* type of extent conversion */ { - xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; + xfs_ifork_t *ifp = xfs_iext_state_to_fork(ip, state); xfs_extnum_t nextents; /* number of extents in file */ int new_size; /* size of extents after removal */ @@ -1934,3 +1979,20 @@ xfs_iext_irec_update_extoffs( ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; } } + +/* + * Initialize an inode's copy-on-write fork. 
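+ *
+ * Safe to call more than once: if i_cowfp is already set up, return
+ * without touching it.  xfs_iformat_fork() above, for instance, calls
+ * this for every inode that comes off disk with the reflink flag set.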
+ */ +void +xfs_ifork_init_cow( + struct xfs_inode *ip) +{ + if (ip->i_cowfp) + return; + + ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone, + KM_SLEEP | KM_NOFS); + ip->i_cowfp->if_flags = XFS_IFEXTENTS; + ip->i_cformat = XFS_DINODE_FMT_EXTENTS; + ip->i_cnextents = 0; +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index f95e072ae646..c9476f50e32d 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -92,7 +92,9 @@ typedef struct xfs_ifork { #define XFS_IFORK_PTR(ip,w) \ ((w) == XFS_DATA_FORK ? \ &(ip)->i_df : \ - (ip)->i_afp) + ((w) == XFS_ATTR_FORK ? \ + (ip)->i_afp : \ + (ip)->i_cowfp)) #define XFS_IFORK_DSIZE(ip) \ (XFS_IFORK_Q(ip) ? \ XFS_IFORK_BOFF(ip) : \ @@ -105,26 +107,38 @@ typedef struct xfs_ifork { #define XFS_IFORK_SIZE(ip,w) \ ((w) == XFS_DATA_FORK ? \ XFS_IFORK_DSIZE(ip) : \ - XFS_IFORK_ASIZE(ip)) + ((w) == XFS_ATTR_FORK ? \ + XFS_IFORK_ASIZE(ip) : \ + 0)) #define XFS_IFORK_FORMAT(ip,w) \ ((w) == XFS_DATA_FORK ? \ (ip)->i_d.di_format : \ - (ip)->i_d.di_aformat) + ((w) == XFS_ATTR_FORK ? \ + (ip)->i_d.di_aformat : \ + (ip)->i_cformat)) #define XFS_IFORK_FMT_SET(ip,w,n) \ ((w) == XFS_DATA_FORK ? \ ((ip)->i_d.di_format = (n)) : \ - ((ip)->i_d.di_aformat = (n))) + ((w) == XFS_ATTR_FORK ? \ + ((ip)->i_d.di_aformat = (n)) : \ + ((ip)->i_cformat = (n)))) #define XFS_IFORK_NEXTENTS(ip,w) \ ((w) == XFS_DATA_FORK ? \ (ip)->i_d.di_nextents : \ - (ip)->i_d.di_anextents) + ((w) == XFS_ATTR_FORK ? \ + (ip)->i_d.di_anextents : \ + (ip)->i_cnextents)) #define XFS_IFORK_NEXT_SET(ip,w,n) \ ((w) == XFS_DATA_FORK ? \ ((ip)->i_d.di_nextents = (n)) : \ - ((ip)->i_d.di_anextents = (n))) + ((w) == XFS_ATTR_FORK ? \ + ((ip)->i_d.di_anextents = (n)) : \ + ((ip)->i_cnextents = (n)))) #define XFS_IFORK_MAXEXT(ip, w) \ (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) +struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); + int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, struct xfs_inode_log_item *, int); @@ -169,4 +183,6 @@ void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int); extern struct kmem_zone *xfs_ifork_zone; +extern void xfs_ifork_init_cow(struct xfs_inode *ip); + #endif /* __XFS_INODE_FORK_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index fc5eef85d61e..083cdd6d6c28 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -112,7 +112,11 @@ static inline uint xlog_get_cycle(char *ptr) #define XLOG_REG_TYPE_ICREATE 20 #define XLOG_REG_TYPE_RUI_FORMAT 21 #define XLOG_REG_TYPE_RUD_FORMAT 22 -#define XLOG_REG_TYPE_MAX 22 +#define XLOG_REG_TYPE_CUI_FORMAT 23 +#define XLOG_REG_TYPE_CUD_FORMAT 24 +#define XLOG_REG_TYPE_BUI_FORMAT 25 +#define XLOG_REG_TYPE_BUD_FORMAT 26 +#define XLOG_REG_TYPE_MAX 26 /* * Flags to log operation header @@ -231,6 +235,10 @@ typedef struct xfs_trans_header { #define XFS_LI_ICREATE 0x123f #define XFS_LI_RUI 0x1240 /* rmap update intent */ #define XFS_LI_RUD 0x1241 +#define XFS_LI_CUI 0x1242 /* refcount update intent */ +#define XFS_LI_CUD 0x1243 +#define XFS_LI_BUI 0x1244 /* bmbt update intent */ +#define XFS_LI_BUD 0x1245 #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ @@ -242,7 +250,11 @@ typedef struct xfs_trans_header { { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }, \ { XFS_LI_ICREATE, "XFS_LI_ICREATE" }, \ { XFS_LI_RUI, "XFS_LI_RUI" }, \ - { XFS_LI_RUD, "XFS_LI_RUD" } + { XFS_LI_RUD, "XFS_LI_RUD" }, \ + { XFS_LI_CUI, "XFS_LI_CUI" }, 
\
+	{ XFS_LI_CUD,		"XFS_LI_CUD" }, \
+	{ XFS_LI_BUI,		"XFS_LI_BUI" }, \
+	{ XFS_LI_BUD,		"XFS_LI_BUD" }
 /*
  * Inode Log Item Format definitions.
@@ -411,7 +423,8 @@ struct xfs_log_dinode {
 	__uint64_t	di_changecount;	/* number of attribute changes */
 	xfs_lsn_t	di_lsn;		/* flush sequence */
 	__uint64_t	di_flags2;	/* more random flags */
-	__uint8_t	di_pad2[16];	/* more padding for future expansion */
+	__uint32_t	di_cowextsize;	/* basic cow extent size for file */
+	__uint8_t	di_pad2[12];	/* more padding for future expansion */
 	/* fields only written to during inode creation */
 	xfs_ictimestamp_t di_crtime;	/* time created */
@@ -622,8 +635,11 @@ struct xfs_map_extent {
 /* rmap me_flags: upper bits are flags, lower byte is type code */
 #define XFS_RMAP_EXTENT_MAP		1
+#define XFS_RMAP_EXTENT_MAP_SHARED	2
 #define XFS_RMAP_EXTENT_UNMAP		3
+#define XFS_RMAP_EXTENT_UNMAP_SHARED	4
 #define XFS_RMAP_EXTENT_CONVERT		5
+#define XFS_RMAP_EXTENT_CONVERT_SHARED	6
 #define XFS_RMAP_EXTENT_ALLOC		7
 #define XFS_RMAP_EXTENT_FREE		8
 #define XFS_RMAP_EXTENT_TYPE_MASK	0xFF
@@ -671,6 +687,102 @@ struct xfs_rud_log_format {
 };
 /*
+ * CUI/CUD (refcount update) log format definitions
+ */
+struct xfs_phys_extent {
+	__uint64_t		pe_startblock;
+	__uint32_t		pe_len;
+	__uint32_t		pe_flags;
+};
+
+/* refcount pe_flags: upper bits are flags, lower byte is type code */
+/* Type codes are taken directly from enum xfs_refcount_intent_type. */
+#define XFS_REFCOUNT_EXTENT_TYPE_MASK	0xFF
+
+#define XFS_REFCOUNT_EXTENT_FLAGS	(XFS_REFCOUNT_EXTENT_TYPE_MASK)
+
+/*
+ * This is the structure used to lay out a cui log item in the
+ * log.  The cui_extents field is a variable size array whose
+ * size is given by cui_nextents.
+ */
+struct xfs_cui_log_format {
+	__uint16_t		cui_type;	/* cui log item type */
+	__uint16_t		cui_size;	/* size of this item */
+	__uint32_t		cui_nextents;	/* # extents to update */
+	__uint64_t		cui_id;		/* cui identifier */
+	struct xfs_phys_extent	cui_extents[];	/* array of extents */
+};
+
+static inline size_t
+xfs_cui_log_format_sizeof(
+	unsigned int		nr)
+{
+	return sizeof(struct xfs_cui_log_format) +
+			nr * sizeof(struct xfs_phys_extent);
+}
+
+/*
+ * This is the structure used to lay out a cud log item in the
+ * log.  The cud carries no extent payload of its own; it marks the
+ * completion of the cui named by cud_cui_id.
+ */
+struct xfs_cud_log_format {
+	__uint16_t		cud_type;	/* cud log item type */
+	__uint16_t		cud_size;	/* size of this item */
+	__uint32_t		__pad;
+	__uint64_t		cud_cui_id;	/* id of corresponding cui */
+};
+
+/*
+ * BUI/BUD (inode block mapping) log format definitions
+ */
+
+/* bmbt me_flags: upper bits are flags, lower byte is type code */
+/* Type codes are taken directly from enum xfs_bmap_intent_type. */
+#define XFS_BMAP_EXTENT_TYPE_MASK	0xFF
+
+#define XFS_BMAP_EXTENT_ATTR_FORK	(1U << 31)
+#define XFS_BMAP_EXTENT_UNWRITTEN	(1U << 30)
+
+#define XFS_BMAP_EXTENT_FLAGS		(XFS_BMAP_EXTENT_TYPE_MASK | \
+					 XFS_BMAP_EXTENT_ATTR_FORK | \
+					 XFS_BMAP_EXTENT_UNWRITTEN)
+
+/*
+ * This is the structure used to lay out a bui log item in the
+ * log.  The bui_extents field is a variable size array whose
+ * size is given by bui_nextents.
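+ *
+ * The item is therefore variable-sized; xfs_bui_log_format_sizeof()
+ * below computes the total as sizeof(struct xfs_bui_log_format) plus
+ * bui_nextents times sizeof(struct xfs_map_extent).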
+ */
+struct xfs_bui_log_format {
+	__uint16_t		bui_type;	/* bui log item type */
+	__uint16_t		bui_size;	/* size of this item */
+	__uint32_t		bui_nextents;	/* # extents to update */
+	__uint64_t		bui_id;		/* bui identifier */
+	struct xfs_map_extent	bui_extents[];	/* array of extents to bmap */
+};
+
+static inline size_t
+xfs_bui_log_format_sizeof(
+	unsigned int		nr)
+{
+	return sizeof(struct xfs_bui_log_format) +
+			nr * sizeof(struct xfs_map_extent);
+}
+
+/*
+ * This is the structure used to lay out a bud log item in the
+ * log.  The bud carries no extent payload of its own; it marks the
+ * completion of the bui named by bud_bui_id.
+ */
+struct xfs_bud_log_format {
+	__uint16_t		bud_type;	/* bud log item type */
+	__uint16_t		bud_size;	/* size of this item */
+	__uint32_t		__pad;
+	__uint64_t		bud_bui_id;	/* id of corresponding bui */
+};
+
+/*
 * Dquot Log format definitions.
 *
 * The first two fields must be the type and size fitting into
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
new file mode 100644
index 000000000000..b177ef33cd4c
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -0,0 +1,1698 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "xfs_refcount.h"
+#include "xfs_rmap.h"
+
+/* Allowable refcount adjustment amounts. */
+enum xfs_refc_adjust_op {
+	XFS_REFCOUNT_ADJUST_INCREASE	= 1,
+	XFS_REFCOUNT_ADJUST_DECREASE	= -1,
+	XFS_REFCOUNT_ADJUST_COW_ALLOC	= 0,
+	XFS_REFCOUNT_ADJUST_COW_FREE	= -1,
+};
+
+STATIC int __xfs_refcount_cow_alloc(struct xfs_btree_cur *rcur,
+		xfs_agblock_t agbno, xfs_extlen_t aglen,
+		struct xfs_defer_ops *dfops);
+STATIC int __xfs_refcount_cow_free(struct xfs_btree_cur *rcur,
+		xfs_agblock_t agbno, xfs_extlen_t aglen,
+		struct xfs_defer_ops *dfops);
+
+/*
+ * Look up the first record less than or equal to [bno, len] in the btree
+ * given by cur.
+ */
+int
+xfs_refcount_lookup_le(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		bno,
+	int			*stat)
+{
+	trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno,
+			XFS_LOOKUP_LE);
+	cur->bc_rec.rc.rc_startblock = bno;
+	cur->bc_rec.rc.rc_blockcount = 0;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Look up the first record greater than or equal to [bno, len] in the btree
+ * given by cur.
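+ *
+ * For example (illustrative): with records starting at agbnos 10 and
+ * 20, looking up block 15 with _le positions the cursor at the record
+ * starting at 10, while looking it up with _ge positions the cursor at
+ * the record starting at 20.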
+ */ +int +xfs_refcount_lookup_ge( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + int *stat) +{ + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + XFS_LOOKUP_GE); + cur->bc_rec.rc.rc_startblock = bno; + cur->bc_rec.rc.rc_blockcount = 0; + return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); +} + +/* Convert on-disk record to in-core format. */ +static inline void +xfs_refcount_btrec_to_irec( + union xfs_btree_rec *rec, + struct xfs_refcount_irec *irec) +{ + irec->rc_startblock = be32_to_cpu(rec->refc.rc_startblock); + irec->rc_blockcount = be32_to_cpu(rec->refc.rc_blockcount); + irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount); +} + +/* + * Get the data from the pointed-to record. + */ +int +xfs_refcount_get_rec( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *irec, + int *stat) +{ + union xfs_btree_rec *rec; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (!error && *stat == 1) { + xfs_refcount_btrec_to_irec(rec, irec); + trace_xfs_refcount_get(cur->bc_mp, cur->bc_private.a.agno, + irec); + } + return error; +} + +/* + * Update the record referred to by cur to the value given + * by [bno, len, refcount]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int +xfs_refcount_update( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *irec) +{ + union xfs_btree_rec rec; + int error; + + trace_xfs_refcount_update(cur->bc_mp, cur->bc_private.a.agno, irec); + rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock); + rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount); + rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount); + error = xfs_btree_update(cur, &rec); + if (error) + trace_xfs_refcount_update_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Insert the record referred to by cur to the value given + * by [bno, len, refcount]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int +xfs_refcount_insert( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *irec, + int *i) +{ + int error; + + trace_xfs_refcount_insert(cur->bc_mp, cur->bc_private.a.agno, irec); + cur->bc_rec.rc.rc_startblock = irec->rc_startblock; + cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount; + cur->bc_rec.rc.rc_refcount = irec->rc_refcount; + error = xfs_btree_insert(cur, i); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error); +out_error: + if (error) + trace_xfs_refcount_insert_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Remove the record referred to by cur, then set the pointer to the spot + * where the record could be re-inserted, in case we want to increment or + * decrement the cursor. + * This either works (return 0) or gets an EFSCORRUPTED error. 
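+ *
+ * e.g. after deleting the record [10, 5], the cursor is repositioned
+ * as if by xfs_refcount_lookup_ge(cur, 10), so the records on either
+ * side of the hole are one increment or decrement away.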
+ */
+STATIC int
+xfs_refcount_delete(
+	struct xfs_btree_cur	*cur,
+	int			*i)
+{
+	struct xfs_refcount_irec	irec;
+	int			found_rec;
+	int			error;
+
+	error = xfs_refcount_get_rec(cur, &irec, &found_rec);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+	trace_xfs_refcount_delete(cur->bc_mp, cur->bc_private.a.agno, &irec);
+	error = xfs_btree_delete(cur, i);
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error);
+	if (error)
+		goto out_error;
+	error = xfs_refcount_lookup_ge(cur, irec.rc_startblock, &found_rec);
+out_error:
+	if (error)
+		trace_xfs_refcount_delete_error(cur->bc_mp,
+				cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Adjusting the Reference Count
+ *
+ * As stated elsewhere, the reference count btree (refcbt) stores
+ * >1 reference counts for extents of physical blocks.  In this
+ * operation, we're either raising or lowering the reference count of
+ * some subrange stored in the tree:
+ *
+ *      <------ adjustment range ------>
+ * ----+   +---+-----+ +--+--------+---------
+ *  2  |   | 3 |  4  | |17|   55   |   10
+ * ----+   +---+-----+ +--+--------+---------
+ * X axis is physical blocks number;
+ * reference counts are the numbers inside the rectangles
+ *
+ * The first thing we need to do is to ensure that there are no
+ * refcount extents crossing either boundary of the range to be
+ * adjusted.  For any extent that does cross a boundary, split it into
+ * two extents so that we can increment the refcount of one of the
+ * pieces later:
+ *
+ *      <------ adjustment range ------>
+ * ----+   +---+-----+ +--+--------+----+----
+ *  2  |   | 3 |  2  | |17|   55   | 10 | 10
+ * ----+   +---+-----+ +--+--------+----+----
+ *
+ * For this next step, let's assume that all the physical blocks in
+ * the adjustment range are mapped to a file and are therefore in use
+ * at least once.  Therefore, we can infer that any gap in the
+ * refcount tree within the adjustment range represents a physical
+ * extent with refcount == 1:
+ *
+ *      <------ adjustment range ------>
+ * ----+---+---+-----+-+--+--------+----+----
+ *  2  |"1"| 3 |  2  |1|17|   55   | 10 | 10
+ * ----+---+---+-----+-+--+--------+----+----
+ *      ^
+ *
+ * For each extent that falls within the interval range, figure out
+ * which extent is to the left or the right of that extent.  Now we
+ * have a left, current, and right extent.  If the new reference count
+ * of the center extent enables us to merge left, center, and right
+ * into one record covering all three, do so.  If the center extent is
+ * at the left end of the range, abuts the left extent, and its new
+ * reference count matches the left extent's record, then merge them.
+ * If the center extent is at the right end of the range, abuts the
+ * right extent, and the reference counts match, merge those.  In the
+ * example, we can left merge (assuming an increment operation):
+ *
+ *      <------ adjustment range ------>
+ * --------+---+-----+-+--+--------+----+----
+ *    2    | 3 |  2  |1|17|   55   | 10 | 10
+ * --------+---+-----+-+--+--------+----+----
+ *      ^
+ *
+ * For all other extents within the range, adjust the reference count
+ * or delete it if the refcount falls below 2.  If we were
+ * incrementing, the end result looks like this:
+ *
+ *      <------ adjustment range ------>
+ * --------+---+-----+-+--+--------+----+----
+ *    2    | 4 |  3  |2|18|   56   | 11 | 10
+ * --------+---+-----+-+--+--------+----+----
+ *
+ * The result of a decrement operation looks as such:
+ *
+ *      <------ adjustment range ------>
+ * ----+   +---+       +--+--------+----+----
+ *  2  |   | 2 |       |16|   54   |  9 | 10
+ * ----+   +---+       +--+--------+----+----
+ *      DDDD    111111DD
+ *
+ * The blocks marked "D" are freed; the blocks marked "1" are only
+ * referenced once and therefore the record is removed from the
+ * refcount btree.
+ */
+
+/* Next block after this extent. */
+static inline xfs_agblock_t
+xfs_refc_next(
+	struct xfs_refcount_irec	*rc)
+{
+	return rc->rc_startblock + rc->rc_blockcount;
+}
+
+/*
+ * Split a refcount extent that crosses agbno.
+ */
+STATIC int
+xfs_refcount_split_extent(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		agbno,
+	bool			*shape_changed)
+{
+	struct xfs_refcount_irec	rcext, tmp;
+	int			found_rec;
+	int			error;
+
+	*shape_changed = false;
+	error = xfs_refcount_lookup_le(cur, agbno, &found_rec);
+	if (error)
+		goto out_error;
+	if (!found_rec)
+		return 0;
+
+	error = xfs_refcount_get_rec(cur, &rcext, &found_rec);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+	if (rcext.rc_startblock == agbno || xfs_refc_next(&rcext) <= agbno)
+		return 0;
+
+	*shape_changed = true;
+	trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_private.a.agno,
+			&rcext, agbno);
+
+	/* Establish the right extent. */
+	tmp = rcext;
+	tmp.rc_startblock = agbno;
+	tmp.rc_blockcount -= (agbno - rcext.rc_startblock);
+	error = xfs_refcount_update(cur, &tmp);
+	if (error)
+		goto out_error;
+
+	/* Insert the left extent. */
+	tmp = rcext;
+	tmp.rc_blockcount = agbno - rcext.rc_startblock;
+	error = xfs_refcount_insert(cur, &tmp, &found_rec);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+	return error;
+
+out_error:
+	trace_xfs_refcount_split_extent_error(cur->bc_mp,
+			cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Merge the left, center, and right extents.
+ */
+STATIC int
+xfs_refcount_merge_center_extents(
+	struct xfs_btree_cur	*cur,
+	struct xfs_refcount_irec	*left,
+	struct xfs_refcount_irec	*center,
+	struct xfs_refcount_irec	*right,
+	unsigned long long	extlen,
+	xfs_agblock_t		*agbno,
+	xfs_extlen_t		*aglen)
+{
+	int			error;
+	int			found_rec;
+
+	trace_xfs_refcount_merge_center_extents(cur->bc_mp,
+			cur->bc_private.a.agno, left, center, right);
+
+	/*
+	 * Make sure the center and right extents are not in the btree.
+	 * If the center extent was synthesized, the first delete call
+	 * removes the right extent and we skip the second deletion.
+	 * If center and right were in the btree, then the first delete
+	 * call removes the center and the second one removes the right
+	 * extent.
+	 */
+	error = xfs_refcount_lookup_ge(cur, center->rc_startblock,
+			&found_rec);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+	error = xfs_refcount_delete(cur, &found_rec);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+	if (center->rc_refcount > 1) {
+		error = xfs_refcount_delete(cur, &found_rec);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+				out_error);
+	}
+
+	/* Enlarge the left extent.
*/ + error = xfs_refcount_lookup_le(cur, left->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + left->rc_blockcount = extlen; + error = xfs_refcount_update(cur, left); + if (error) + goto out_error; + + *aglen = 0; + return error; + +out_error: + trace_xfs_refcount_merge_center_extents_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Merge with the left extent. + */ +STATIC int +xfs_refcount_merge_left_extent( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *left, + struct xfs_refcount_irec *cleft, + xfs_agblock_t *agbno, + xfs_extlen_t *aglen) +{ + int error; + int found_rec; + + trace_xfs_refcount_merge_left_extent(cur->bc_mp, + cur->bc_private.a.agno, left, cleft); + + /* If the extent at agbno (cleft) wasn't synthesized, remove it. */ + if (cleft->rc_refcount > 1) { + error = xfs_refcount_lookup_le(cur, cleft->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + + error = xfs_refcount_delete(cur, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + } + + /* Enlarge the left extent. */ + error = xfs_refcount_lookup_le(cur, left->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + left->rc_blockcount += cleft->rc_blockcount; + error = xfs_refcount_update(cur, left); + if (error) + goto out_error; + + *agbno += cleft->rc_blockcount; + *aglen -= cleft->rc_blockcount; + return error; + +out_error: + trace_xfs_refcount_merge_left_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Merge with the right extent. + */ +STATIC int +xfs_refcount_merge_right_extent( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *right, + struct xfs_refcount_irec *cright, + xfs_agblock_t *agbno, + xfs_extlen_t *aglen) +{ + int error; + int found_rec; + + trace_xfs_refcount_merge_right_extent(cur->bc_mp, + cur->bc_private.a.agno, cright, right); + + /* + * If the extent ending at agbno+aglen (cright) wasn't synthesized, + * remove it. + */ + if (cright->rc_refcount > 1) { + error = xfs_refcount_lookup_le(cur, cright->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + + error = xfs_refcount_delete(cur, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + } + + /* Enlarge the right extent. */ + error = xfs_refcount_lookup_le(cur, right->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + right->rc_startblock -= cright->rc_blockcount; + right->rc_blockcount += cright->rc_blockcount; + error = xfs_refcount_update(cur, right); + if (error) + goto out_error; + + *aglen -= cright->rc_blockcount; + return error; + +out_error: + trace_xfs_refcount_merge_right_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +#define XFS_FIND_RCEXT_SHARED 1 +#define XFS_FIND_RCEXT_COW 2 +/* + * Find the left extent and the one after it (cleft). This function assumes + * that we've already split any extent crossing agbno. 
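+ *
+ * For example (illustrative): with agbno == 10, a record [8, 2] that
+ * ends exactly at block 10 is returned as left; if the next record
+ * starts at block 14, cleft is synthesized as the implied extent
+ * [10, min(aglen, 4)] with a refcount of 1.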
+ */ +STATIC int +xfs_refcount_find_left_extents( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *left, + struct xfs_refcount_irec *cleft, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + int flags) +{ + struct xfs_refcount_irec tmp; + int error; + int found_rec; + + left->rc_startblock = cleft->rc_startblock = NULLAGBLOCK; + error = xfs_refcount_lookup_le(cur, agbno - 1, &found_rec); + if (error) + goto out_error; + if (!found_rec) + return 0; + + error = xfs_refcount_get_rec(cur, &tmp, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + if (xfs_refc_next(&tmp) != agbno) + return 0; + if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2) + return 0; + if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1) + return 0; + /* We have a left extent; retrieve (or invent) the next right one */ + *left = tmp; + + error = xfs_btree_increment(cur, 0, &found_rec); + if (error) + goto out_error; + if (found_rec) { + error = xfs_refcount_get_rec(cur, &tmp, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + + /* if tmp starts at the end of our range, just use that */ + if (tmp.rc_startblock == agbno) + *cleft = tmp; + else { + /* + * There's a gap in the refcntbt at the start of the + * range we're interested in (refcount == 1) so + * synthesize the implied extent and pass it back. + * We assume here that the agbno/aglen range was + * passed in from a data fork extent mapping and + * therefore is allocated to exactly one owner. + */ + cleft->rc_startblock = agbno; + cleft->rc_blockcount = min(aglen, + tmp.rc_startblock - agbno); + cleft->rc_refcount = 1; + } + } else { + /* + * No extents, so pretend that there's one covering the whole + * range. + */ + cleft->rc_startblock = agbno; + cleft->rc_blockcount = aglen; + cleft->rc_refcount = 1; + } + trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_private.a.agno, + left, cleft, agbno); + return error; + +out_error: + trace_xfs_refcount_find_left_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Find the right extent and the one before it (cright). This function + * assumes that we've already split any extents crossing agbno + aglen. 
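+ *
+ * This mirrors the left-hand case above: a record starting exactly at
+ * agbno + aglen becomes right, and cright is either the record ending
+ * there or a synthesized refcount-1 extent filling the gap before it.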
+ */ +STATIC int +xfs_refcount_find_right_extents( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *right, + struct xfs_refcount_irec *cright, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + int flags) +{ + struct xfs_refcount_irec tmp; + int error; + int found_rec; + + right->rc_startblock = cright->rc_startblock = NULLAGBLOCK; + error = xfs_refcount_lookup_ge(cur, agbno + aglen, &found_rec); + if (error) + goto out_error; + if (!found_rec) + return 0; + + error = xfs_refcount_get_rec(cur, &tmp, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + if (tmp.rc_startblock != agbno + aglen) + return 0; + if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2) + return 0; + if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1) + return 0; + /* We have a right extent; retrieve (or invent) the next left one */ + *right = tmp; + + error = xfs_btree_decrement(cur, 0, &found_rec); + if (error) + goto out_error; + if (found_rec) { + error = xfs_refcount_get_rec(cur, &tmp, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + + /* if tmp ends at the end of our range, just use that */ + if (xfs_refc_next(&tmp) == agbno + aglen) + *cright = tmp; + else { + /* + * There's a gap in the refcntbt at the end of the + * range we're interested in (refcount == 1) so + * create the implied extent and pass it back. + * We assume here that the agbno/aglen range was + * passed in from a data fork extent mapping and + * therefore is allocated to exactly one owner. + */ + cright->rc_startblock = max(agbno, xfs_refc_next(&tmp)); + cright->rc_blockcount = right->rc_startblock - + cright->rc_startblock; + cright->rc_refcount = 1; + } + } else { + /* + * No extents, so pretend that there's one covering the whole + * range. + */ + cright->rc_startblock = agbno; + cright->rc_blockcount = aglen; + cright->rc_refcount = 1; + } + trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_private.a.agno, + cright, right, agbno + aglen); + return error; + +out_error: + trace_xfs_refcount_find_right_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* Is this extent valid? */ +static inline bool +xfs_refc_valid( + struct xfs_refcount_irec *rc) +{ + return rc->rc_startblock != NULLAGBLOCK; +} + +/* + * Try to merge with any extents on the boundaries of the adjustment range. + */ +STATIC int +xfs_refcount_merge_extents( + struct xfs_btree_cur *cur, + xfs_agblock_t *agbno, + xfs_extlen_t *aglen, + enum xfs_refc_adjust_op adjust, + int flags, + bool *shape_changed) +{ + struct xfs_refcount_irec left = {0}, cleft = {0}; + struct xfs_refcount_irec cright = {0}, right = {0}; + int error; + unsigned long long ulen; + bool cequal; + + *shape_changed = false; + /* + * Find the extent just below agbno [left], just above agbno [cleft], + * just below (agbno + aglen) [cright], and just above (agbno + aglen) + * [right]. + */ + error = xfs_refcount_find_left_extents(cur, &left, &cleft, *agbno, + *aglen, flags); + if (error) + return error; + error = xfs_refcount_find_right_extents(cur, &right, &cright, *agbno, + *aglen, flags); + if (error) + return error; + + /* No left or right extent to merge; exit. */ + if (!xfs_refc_valid(&left) && !xfs_refc_valid(&right)) + return 0; + + cequal = (cleft.rc_startblock == cright.rc_startblock) && + (cleft.rc_blockcount == cright.rc_blockcount); + + /* Try to merge left, cleft, and right. cleft must == cright. 
*/ + ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount + + right.rc_blockcount; + if (xfs_refc_valid(&left) && xfs_refc_valid(&right) && + xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal && + left.rc_refcount == cleft.rc_refcount + adjust && + right.rc_refcount == cleft.rc_refcount + adjust && + ulen < MAXREFCEXTLEN) { + *shape_changed = true; + return xfs_refcount_merge_center_extents(cur, &left, &cleft, + &right, ulen, agbno, aglen); + } + + /* Try to merge left and cleft. */ + ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount; + if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) && + left.rc_refcount == cleft.rc_refcount + adjust && + ulen < MAXREFCEXTLEN) { + *shape_changed = true; + error = xfs_refcount_merge_left_extent(cur, &left, &cleft, + agbno, aglen); + if (error) + return error; + + /* + * If we just merged left + cleft and cleft == cright, + * we no longer have a cright to merge with right. We're done. + */ + if (cequal) + return 0; + } + + /* Try to merge cright and right. */ + ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount; + if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) && + right.rc_refcount == cright.rc_refcount + adjust && + ulen < MAXREFCEXTLEN) { + *shape_changed = true; + return xfs_refcount_merge_right_extent(cur, &right, &cright, + agbno, aglen); + } + + return error; +} + +/* + * While we're adjusting the refcounts records of an extent, we have + * to keep an eye on the number of extents we're dirtying -- run too + * many in a single transaction and we'll exceed the transaction's + * reservation and crash the fs. Each record adds 12 bytes to the + * log (plus any key updates) so we'll conservatively assume 24 bytes + * per record. We must also leave space for btree splits on both ends + * of the range and space for the CUD and a new CUI. + * + * XXX: This is a pretty hand-wavy estimate. The penalty for guessing + * true incorrectly is a shutdown FS; the penalty for guessing false + * incorrectly is more transaction rolls than might be necessary. + * Be conservative here. + */ +static bool +xfs_refcount_still_have_space( + struct xfs_btree_cur *cur) +{ + unsigned long overhead; + + overhead = cur->bc_private.a.priv.refc.shape_changes * + xfs_allocfree_log_count(cur->bc_mp, 1); + overhead *= cur->bc_mp->m_sb.sb_blocksize; + + /* + * Only allow 2 refcount extent updates per transaction if the + * refcount continue update "error" has been injected. + */ + if (cur->bc_private.a.priv.refc.nr_ops > 2 && + XFS_TEST_ERROR(false, cur->bc_mp, + XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE, + XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE)) + return false; + + if (cur->bc_private.a.priv.refc.nr_ops == 0) + return true; + else if (overhead > cur->bc_tp->t_log_res) + return false; + return cur->bc_tp->t_log_res - overhead > + cur->bc_private.a.priv.refc.nr_ops * 32; +} + +/* + * Adjust the refcounts of middle extents. At this point we should have + * split extents that crossed the adjustment range; merged with adjacent + * extents; and updated agbno/aglen to reflect the merges. Therefore, + * all we have to do is update the extents inside [agbno, agbno + aglen]. + */ +STATIC int +xfs_refcount_adjust_extents( + struct xfs_btree_cur *cur, + xfs_agblock_t *agbno, + xfs_extlen_t *aglen, + enum xfs_refc_adjust_op adj, + struct xfs_defer_ops *dfops, + struct xfs_owner_info *oinfo) +{ + struct xfs_refcount_irec ext, tmp; + int error; + int found_rec, found_tmp; + xfs_fsblock_t fsbno; + + /* Merging did all the work already. 
*/ + if (*aglen == 0) + return 0; + + error = xfs_refcount_lookup_ge(cur, *agbno, &found_rec); + if (error) + goto out_error; + + while (*aglen > 0 && xfs_refcount_still_have_space(cur)) { + error = xfs_refcount_get_rec(cur, &ext, &found_rec); + if (error) + goto out_error; + if (!found_rec) { + ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; + ext.rc_blockcount = 0; + ext.rc_refcount = 0; + } + + /* + * Deal with a hole in the refcount tree; if a file maps to + * these blocks and there's no refcountbt record, pretend that + * there is one with refcount == 1. + */ + if (ext.rc_startblock != *agbno) { + tmp.rc_startblock = *agbno; + tmp.rc_blockcount = min(*aglen, + ext.rc_startblock - *agbno); + tmp.rc_refcount = 1 + adj; + trace_xfs_refcount_modify_extent(cur->bc_mp, + cur->bc_private.a.agno, &tmp); + + /* + * Either cover the hole (increment) or + * delete the range (decrement). + */ + if (tmp.rc_refcount) { + error = xfs_refcount_insert(cur, &tmp, + &found_tmp); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + found_tmp == 1, out_error); + cur->bc_private.a.priv.refc.nr_ops++; + } else { + fsbno = XFS_AGB_TO_FSB(cur->bc_mp, + cur->bc_private.a.agno, + tmp.rc_startblock); + xfs_bmap_add_free(cur->bc_mp, dfops, fsbno, + tmp.rc_blockcount, oinfo); + } + + (*agbno) += tmp.rc_blockcount; + (*aglen) -= tmp.rc_blockcount; + + error = xfs_refcount_lookup_ge(cur, *agbno, + &found_rec); + if (error) + goto out_error; + } + + /* Stop if there's nothing left to modify */ + if (*aglen == 0 || !xfs_refcount_still_have_space(cur)) + break; + + /* + * Adjust the reference count and either update the tree + * (incr) or free the blocks (decr). + */ + if (ext.rc_refcount == MAXREFCOUNT) + goto skip; + ext.rc_refcount += adj; + trace_xfs_refcount_modify_extent(cur->bc_mp, + cur->bc_private.a.agno, &ext); + if (ext.rc_refcount > 1) { + error = xfs_refcount_update(cur, &ext); + if (error) + goto out_error; + cur->bc_private.a.priv.refc.nr_ops++; + } else if (ext.rc_refcount == 1) { + error = xfs_refcount_delete(cur, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + found_rec == 1, out_error); + cur->bc_private.a.priv.refc.nr_ops++; + goto advloop; + } else { + fsbno = XFS_AGB_TO_FSB(cur->bc_mp, + cur->bc_private.a.agno, + ext.rc_startblock); + xfs_bmap_add_free(cur->bc_mp, dfops, fsbno, + ext.rc_blockcount, oinfo); + } + +skip: + error = xfs_btree_increment(cur, 0, &found_rec); + if (error) + goto out_error; + +advloop: + (*agbno) += ext.rc_blockcount; + (*aglen) -= ext.rc_blockcount; + } + + return error; +out_error: + trace_xfs_refcount_modify_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* Adjust the reference count of a range of AG blocks. */ +STATIC int +xfs_refcount_adjust( + struct xfs_btree_cur *cur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + xfs_agblock_t *new_agbno, + xfs_extlen_t *new_aglen, + enum xfs_refc_adjust_op adj, + struct xfs_defer_ops *dfops, + struct xfs_owner_info *oinfo) +{ + bool shape_changed; + int shape_changes = 0; + int error; + + *new_agbno = agbno; + *new_aglen = aglen; + if (adj == XFS_REFCOUNT_ADJUST_INCREASE) + trace_xfs_refcount_increase(cur->bc_mp, cur->bc_private.a.agno, + agbno, aglen); + else + trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_private.a.agno, + agbno, aglen); + + /* + * Ensure that no rcextents cross the boundary of the adjustment range. 
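+ *
+ * e.g. when adjusting [10, 10], a record [5, 10] is split into [5, 5]
+ * and [10, 5] so that only whole records fall inside the range.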
+ */ + error = xfs_refcount_split_extent(cur, agbno, &shape_changed); + if (error) + goto out_error; + if (shape_changed) + shape_changes++; + + error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed); + if (error) + goto out_error; + if (shape_changed) + shape_changes++; + + /* + * Try to merge with the left or right extents of the range. + */ + error = xfs_refcount_merge_extents(cur, new_agbno, new_aglen, adj, + XFS_FIND_RCEXT_SHARED, &shape_changed); + if (error) + goto out_error; + if (shape_changed) + shape_changes++; + if (shape_changes) + cur->bc_private.a.priv.refc.shape_changes++; + + /* Now that we've taken care of the ends, adjust the middle extents */ + error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, + adj, dfops, oinfo); + if (error) + goto out_error; + + return 0; + +out_error: + trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_private.a.agno, + error, _RET_IP_); + return error; +} + +/* Clean up after calling xfs_refcount_finish_one. */ +void +xfs_refcount_finish_one_cleanup( + struct xfs_trans *tp, + struct xfs_btree_cur *rcur, + int error) +{ + struct xfs_buf *agbp; + + if (rcur == NULL) + return; + agbp = rcur->bc_private.a.agbp; + xfs_btree_del_cursor(rcur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + if (error) + xfs_trans_brelse(tp, agbp); +} + +/* + * Process one of the deferred refcount operations. We pass back the + * btree cursor to maintain our lock on the btree between calls. + * This saves time and eliminates a buffer deadlock between the + * superblock and the AGF because we'll always grab them in the same + * order. + */ +int +xfs_refcount_finish_one( + struct xfs_trans *tp, + struct xfs_defer_ops *dfops, + enum xfs_refcount_intent_type type, + xfs_fsblock_t startblock, + xfs_extlen_t blockcount, + xfs_fsblock_t *new_fsb, + xfs_extlen_t *new_len, + struct xfs_btree_cur **pcur) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_btree_cur *rcur; + struct xfs_buf *agbp = NULL; + int error = 0; + xfs_agnumber_t agno; + xfs_agblock_t bno; + xfs_agblock_t new_agbno; + unsigned long nr_ops = 0; + int shape_changes = 0; + + agno = XFS_FSB_TO_AGNO(mp, startblock); + ASSERT(agno != NULLAGNUMBER); + bno = XFS_FSB_TO_AGBNO(mp, startblock); + + trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock), + type, XFS_FSB_TO_AGBNO(mp, startblock), + blockcount); + + if (XFS_TEST_ERROR(false, mp, + XFS_ERRTAG_REFCOUNT_FINISH_ONE, + XFS_RANDOM_REFCOUNT_FINISH_ONE)) + return -EIO; + + /* + * If we haven't gotten a cursor or the cursor AG doesn't match + * the startblock, get one now. 
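*/

The cursor handling described here is easy to see in miniature: keep the cursor while successive intents land in the same AG, and carry the running counters across a rebuild. Hypothetical model, not kernel API:

	#include <stdio.h>
	#include <stdlib.h>

	struct cursor { unsigned int agno; unsigned long nr_ops; };

	static struct cursor *get_cursor(struct cursor *cached, unsigned int agno)
	{
		if (cached && cached->agno == agno)
			return cached;		/* same AG: keep it (and the lock) */

		unsigned long carried = cached ? cached->nr_ops : 0;
		free(cached);			/* the "finish_one_cleanup" step */

		struct cursor *c = malloc(sizeof(*c));
		c->agno = agno;
		c->nr_ops = carried;		/* preserve the running count */
		return c;
	}

	int main(void)
	{
		unsigned int ag_of_intent[] = { 0, 0, 1, 1, 1, 0 };
		struct cursor *cur = NULL;

		for (unsigned int i = 0; i < 6; i++) {
			cur = get_cursor(cur, ag_of_intent[i]);
			cur->nr_ops++;
			printf("intent %u: AG %u, nr_ops %lu\n",
			       i, cur->agno, cur->nr_ops);
		}
		free(cur);
		return 0;
	}

/*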
+ */ + rcur = *pcur; + if (rcur != NULL && rcur->bc_private.a.agno != agno) { + nr_ops = rcur->bc_private.a.priv.refc.nr_ops; + shape_changes = rcur->bc_private.a.priv.refc.shape_changes; + xfs_refcount_finish_one_cleanup(tp, rcur, 0); + rcur = NULL; + *pcur = NULL; + } + if (rcur == NULL) { + error = xfs_alloc_read_agf(tp->t_mountp, tp, agno, + XFS_ALLOC_FLAG_FREEING, &agbp); + if (error) + return error; + if (!agbp) + return -EFSCORRUPTED; + + rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, dfops); + if (!rcur) { + error = -ENOMEM; + goto out_cur; + } + rcur->bc_private.a.priv.refc.nr_ops = nr_ops; + rcur->bc_private.a.priv.refc.shape_changes = shape_changes; + } + *pcur = rcur; + + switch (type) { + case XFS_REFCOUNT_INCREASE: + error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, + new_len, XFS_REFCOUNT_ADJUST_INCREASE, dfops, NULL); + *new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno); + break; + case XFS_REFCOUNT_DECREASE: + error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, + new_len, XFS_REFCOUNT_ADJUST_DECREASE, dfops, NULL); + *new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno); + break; + case XFS_REFCOUNT_ALLOC_COW: + *new_fsb = startblock + blockcount; + *new_len = 0; + error = __xfs_refcount_cow_alloc(rcur, bno, blockcount, dfops); + break; + case XFS_REFCOUNT_FREE_COW: + *new_fsb = startblock + blockcount; + *new_len = 0; + error = __xfs_refcount_cow_free(rcur, bno, blockcount, dfops); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + } + if (!error && *new_len > 0) + trace_xfs_refcount_finish_one_leftover(mp, agno, type, + bno, blockcount, new_agbno, *new_len); + return error; + +out_cur: + xfs_trans_brelse(tp, agbp); + + return error; +} + +/* + * Record a refcount intent for later processing. + */ +static int +__xfs_refcount_add( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + enum xfs_refcount_intent_type type, + xfs_fsblock_t startblock, + xfs_extlen_t blockcount) +{ + struct xfs_refcount_intent *ri; + + trace_xfs_refcount_defer(mp, XFS_FSB_TO_AGNO(mp, startblock), + type, XFS_FSB_TO_AGBNO(mp, startblock), + blockcount); + + ri = kmem_alloc(sizeof(struct xfs_refcount_intent), + KM_SLEEP | KM_NOFS); + INIT_LIST_HEAD(&ri->ri_list); + ri->ri_type = type; + ri->ri_startblock = startblock; + ri->ri_blockcount = blockcount; + + xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); + return 0; +} + +/* + * Increase the reference count of the blocks backing a file's extent. + */ +int +xfs_refcount_increase_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + struct xfs_bmbt_irec *PREV) +{ + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_INCREASE, + PREV->br_startblock, PREV->br_blockcount); +} + +/* + * Decrease the reference count of the blocks backing a file's extent. + */ +int +xfs_refcount_decrease_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + struct xfs_bmbt_irec *PREV) +{ + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_DECREASE, + PREV->br_startblock, PREV->br_blockcount); +} + +/* + * Given an AG extent, find the lowest-numbered run of shared blocks + * within that range and return the range in fbno/flen. If + * find_end_of_shared is set, return the longest contiguous extent of + * shared blocks; if not, just return the first extent we find. If no + * shared blocks are found, fbno and flen will be set to NULLAGBLOCK + * and 0, respectively. 
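*/

A typical use of the function that follows is simply asking whether anything in a range is shared. A toy in-memory stand-in over a sorted record array (the btree walk is elided, and only the first shared run is reported):

	#include <stdio.h>

	#define NULLAGBLOCK	((unsigned int)-1)

	struct refc_rec { unsigned int start, len, refcount; };

	/* Records are sorted and non-overlapping, as in the refcount btree. */
	static void find_shared(const struct refc_rec *recs, int nrecs,
				unsigned int agbno, unsigned int aglen,
				unsigned int *fbno, unsigned int *flen)
	{
		*fbno = NULLAGBLOCK;
		*flen = 0;
		for (int i = 0; i < nrecs; i++) {
			unsigned int end = recs[i].start + recs[i].len;

			if (end <= agbno || recs[i].start >= agbno + aglen)
				continue;	/* no overlap with [agbno, +aglen) */
			*fbno = recs[i].start > agbno ? recs[i].start : agbno;
			*flen = (end < agbno + aglen ? end : agbno + aglen) - *fbno;
			return;		/* first (lowest-numbered) shared run wins */
		}
	}

	int main(void)
	{
		/* only extents with refcount >= 2 appear in the tree */
		struct refc_rec recs[] = { { 50, 10, 2 }, { 90, 20, 5 } };
		unsigned int fbno, flen;

		find_shared(recs, 2, 80, 40, &fbno, &flen);
		if (fbno != NULLAGBLOCK)
			printf("shared run at %u, len %u\n", fbno, flen);
		return 0;
	}

/*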
+ */ +int +xfs_refcount_find_shared( + struct xfs_btree_cur *cur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + xfs_agblock_t *fbno, + xfs_extlen_t *flen, + bool find_end_of_shared) +{ + struct xfs_refcount_irec tmp; + int i; + int have; + int error; + + trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_private.a.agno, + agbno, aglen); + + /* By default, skip the whole range */ + *fbno = NULLAGBLOCK; + *flen = 0; + + /* Try to find a refcount extent that crosses the start */ + error = xfs_refcount_lookup_le(cur, agbno, &have); + if (error) + goto out_error; + if (!have) { + /* No left extent, look at the next one */ + error = xfs_btree_increment(cur, 0, &have); + if (error) + goto out_error; + if (!have) + goto done; + } + error = xfs_refcount_get_rec(cur, &tmp, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error); + + /* If the extent ends before the start, look at the next one */ + if (tmp.rc_startblock + tmp.rc_blockcount <= agbno) { + error = xfs_btree_increment(cur, 0, &have); + if (error) + goto out_error; + if (!have) + goto done; + error = xfs_refcount_get_rec(cur, &tmp, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error); + } + + /* If the extent starts after the range we want, bail out */ + if (tmp.rc_startblock >= agbno + aglen) + goto done; + + /* We found the start of a shared extent! */ + if (tmp.rc_startblock < agbno) { + tmp.rc_blockcount -= (agbno - tmp.rc_startblock); + tmp.rc_startblock = agbno; + } + + *fbno = tmp.rc_startblock; + *flen = min(tmp.rc_blockcount, agbno + aglen - *fbno); + if (!find_end_of_shared) + goto done; + + /* Otherwise, find the end of this shared extent */ + while (*fbno + *flen < agbno + aglen) { + error = xfs_btree_increment(cur, 0, &have); + if (error) + goto out_error; + if (!have) + break; + error = xfs_refcount_get_rec(cur, &tmp, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error); + if (tmp.rc_startblock >= agbno + aglen || + tmp.rc_startblock != *fbno + *flen) + break; + *flen = min(*flen + tmp.rc_blockcount, agbno + aglen - *fbno); + } + +done: + trace_xfs_refcount_find_shared_result(cur->bc_mp, + cur->bc_private.a.agno, *fbno, *flen); + +out_error: + if (error) + trace_xfs_refcount_find_shared_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Recovering CoW Blocks After a Crash + * + * Due to the way that the copy on write mechanism works, there's a window of + * opportunity in which we can lose track of allocated blocks during a crash. + * Because CoW uses delayed allocation in the in-core CoW fork, writeback + * causes blocks to be allocated and stored in the CoW fork. The blocks are + * no longer in the free space btree but are not otherwise recorded anywhere + * until the write completes and the blocks are mapped into the file. A crash + * in between allocation and remapping results in the replacement blocks being + * lost. This situation is exacerbated by the CoW extent size hint because + * allocations can hang around for a long time. + * + * However, there is a place where we can record these allocations before they + * become mappings -- the reference count btree. The btree does not record + * extents with refcount == 1, so we can record allocations with a refcount of + * 1. Blocks being used for CoW writeout cannot be shared, so there should be + * no conflict with shared block records. 
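*/

The staging records are kept apart from shared-extent records by biasing their start block, as the CoW adjust code below does with XFS_REFC_COW_START. A sketch of the encode/decode; the constant's value here is an assumption, since its definition is not in this hunk:

	#include <stdio.h>

	#define COW_START	(1U << 31)	/* stand-in for XFS_REFC_COW_START */

	int main(void)
	{
		unsigned int agbno = 1234;

		unsigned int key = agbno + COW_START;	/* stored btree key */
		unsigned int decoded = key - COW_START;	/* recovered agbno */

		printf("agbno %u -> btree key %u -> agbno %u\n",
		       agbno, key, decoded);
		printf("always sorts after shared-extent keys below %u\n",
		       COW_START);
		return 0;
	}

Because every staging key lives above the bias, a staging record can never be adjacent to, and so never merged with, a shared-extent record.

/*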
These mappings should be created + * when we allocate blocks to the CoW fork and deleted when they're removed + * from the CoW fork. + * + * Minor nit: records for in-progress CoW allocations and records for shared + * extents must never be merged, to preserve the property that (except for CoW + * allocations) there are no refcount btree entries with refcount == 1. The + * only time this could potentially happen is when unsharing a block that's + * adjacent to CoW allocations, so we must be careful to avoid this. + * + * At mount time we recover lost CoW allocations by searching the refcount + * btree for these refcount == 1 mappings. These represent CoW allocations + * that were in progress at the time the filesystem went down, so we can free + * them to get the space back. + * + * This mechanism is superior to creating EFIs for unmapped CoW extents for + * several reasons -- first, EFIs pin the tail of the log and would have to be + * periodically relogged to avoid filling up the log. Second, CoW completions + * will have to file an EFD and create new EFIs for whatever remains in the + * CoW fork; this partially takes care of (1) but extent-size reservations + * will have to periodically relog even if there's no writeout in progress. + * This can happen if the CoW extent size hint is set, which you really want. + * Third, EFIs cannot currently be automatically relogged into newer + * transactions to advance the log tail. Fourth, stuffing the log full of + * EFIs places an upper bound on the number of CoW allocations that can be + * held filesystem-wide at any given time. Recording them in the refcount + * btree doesn't require us to maintain any state in memory and doesn't pin + * the log. + */ +/* + * Adjust the refcounts of CoW allocations. These allocations are "magic" + * in that they're not referenced anywhere else in the filesystem, so we + * stash them in the refcount btree with a refcount of 1 until either file + * remapping (or CoW cancellation) happens. + */ +STATIC int +xfs_refcount_adjust_cow_extents( + struct xfs_btree_cur *cur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + enum xfs_refc_adjust_op adj, + struct xfs_defer_ops *dfops, + struct xfs_owner_info *oinfo) +{ + struct xfs_refcount_irec ext, tmp; + int error; + int found_rec, found_tmp; + + if (aglen == 0) + return 0; + + /* Find any overlapping refcount records */ + error = xfs_refcount_lookup_ge(cur, agbno, &found_rec); + if (error) + goto out_error; + error = xfs_refcount_get_rec(cur, &ext, &found_rec); + if (error) + goto out_error; + if (!found_rec) { + ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks + + XFS_REFC_COW_START; + ext.rc_blockcount = 0; + ext.rc_refcount = 0; + } + + switch (adj) { + case XFS_REFCOUNT_ADJUST_COW_ALLOC: + /* Adding a CoW reservation, there should be nothing here. */ + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + ext.rc_startblock >= agbno + aglen, out_error); + + tmp.rc_startblock = agbno; + tmp.rc_blockcount = aglen; + tmp.rc_refcount = 1; + trace_xfs_refcount_modify_extent(cur->bc_mp, + cur->bc_private.a.agno, &tmp); + + error = xfs_refcount_insert(cur, &tmp, + &found_tmp); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + found_tmp == 1, out_error); + break; + case XFS_REFCOUNT_ADJUST_COW_FREE: + /* Removing a CoW reservation, there should be one extent. 
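*/

The two cases of this switch enforce a strict contract on a staging extent: created once, with nothing already there, and deleted exactly once, covering exactly the same range. A tiny state model, with assert standing in for the corruption checks:

	#include <assert.h>
	#include <stdio.h>

	enum cow_state { COW_NONE, COW_STAGED };

	struct cow_extent { enum cow_state state; unsigned int agbno, len; };

	static void cow_alloc(struct cow_extent *e, unsigned int agbno,
			      unsigned int len)
	{
		assert(e->state == COW_NONE);	/* "there should be nothing here" */
		e->state = COW_STAGED;		/* refcount-1 record goes in */
		e->agbno = agbno;
		e->len = len;
	}

	static void cow_free(struct cow_extent *e, unsigned int agbno,
			     unsigned int len)
	{
		/* "there should be one extent" covering exactly this range */
		assert(e->state == COW_STAGED && e->agbno == agbno &&
		       e->len == len);
		e->state = COW_NONE;		/* record deleted again */
	}

	int main(void)
	{
		struct cow_extent e = { COW_NONE, 0, 0 };

		cow_alloc(&e, 500, 8);	/* delalloc conversion at writeback */
		cow_free(&e, 500, 8);	/* remap into the file, or cancel */
		printf("staging extent cleanly retired\n");
		return 0;
	}

/*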
*/ + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + ext.rc_startblock == agbno, out_error); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + ext.rc_blockcount == aglen, out_error); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + ext.rc_refcount == 1, out_error); + + ext.rc_refcount = 0; + trace_xfs_refcount_modify_extent(cur->bc_mp, + cur->bc_private.a.agno, &ext); + error = xfs_refcount_delete(cur, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + found_rec == 1, out_error); + break; + default: + ASSERT(0); + } + + return error; +out_error: + trace_xfs_refcount_modify_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Add or remove refcount btree entries for CoW reservations. + */ +STATIC int +xfs_refcount_adjust_cow( + struct xfs_btree_cur *cur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + enum xfs_refc_adjust_op adj, + struct xfs_defer_ops *dfops) +{ + bool shape_changed; + int error; + + agbno += XFS_REFC_COW_START; + + /* + * Ensure that no rcextents cross the boundary of the adjustment range. + */ + error = xfs_refcount_split_extent(cur, agbno, &shape_changed); + if (error) + goto out_error; + + error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed); + if (error) + goto out_error; + + /* + * Try to merge with the left or right extents of the range. + */ + error = xfs_refcount_merge_extents(cur, &agbno, &aglen, adj, + XFS_FIND_RCEXT_COW, &shape_changed); + if (error) + goto out_error; + + /* Now that we've taken care of the ends, adjust the middle extents */ + error = xfs_refcount_adjust_cow_extents(cur, agbno, aglen, adj, + dfops, NULL); + if (error) + goto out_error; + + return 0; + +out_error: + trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_private.a.agno, + error, _RET_IP_); + return error; +} + +/* + * Record a CoW allocation in the refcount btree. + */ +STATIC int +__xfs_refcount_cow_alloc( + struct xfs_btree_cur *rcur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + struct xfs_defer_ops *dfops) +{ + int error; + + trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno, + agbno, aglen); + + /* Add refcount btree reservation */ + error = xfs_refcount_adjust_cow(rcur, agbno, aglen, + XFS_REFCOUNT_ADJUST_COW_ALLOC, dfops); + if (error) + return error; + + /* Add rmap entry */ + if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) { + error = xfs_rmap_alloc_extent(rcur->bc_mp, dfops, + rcur->bc_private.a.agno, + agbno, aglen, XFS_RMAP_OWN_COW); + if (error) + return error; + } + + return error; +} + +/* + * Remove a CoW allocation from the refcount btree. + */ +STATIC int +__xfs_refcount_cow_free( + struct xfs_btree_cur *rcur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + struct xfs_defer_ops *dfops) +{ + int error; + + trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno, + agbno, aglen); + + /* Remove refcount btree reservation */ + error = xfs_refcount_adjust_cow(rcur, agbno, aglen, + XFS_REFCOUNT_ADJUST_COW_FREE, dfops); + if (error) + return error; + + /* Remove rmap entry */ + if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) { + error = xfs_rmap_free_extent(rcur->bc_mp, dfops, + rcur->bc_private.a.agno, + agbno, aglen, XFS_RMAP_OWN_COW); + if (error) + return error; + } + + return error; +} + +/* Record a CoW staging extent in the refcount btree. 
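*/

The *_cow_extent wrappers below (like __xfs_refcount_add earlier) only log an intent; the btree edits happen later when the deferred-ops machinery calls xfs_refcount_finish_one. The deferral pattern in miniature, with invented names:

	#include <stdio.h>
	#include <stdlib.h>

	enum intent_type { REFC_INCREASE, REFC_DECREASE,
			   REFC_ALLOC_COW, REFC_FREE_COW };

	struct intent {
		struct intent		*next;
		enum intent_type	type;
		unsigned long long	startblock;
		unsigned int		blockcount;
	};

	/* Append so intents are finished in the order they were logged. */
	static void defer(struct intent **q, enum intent_type t,
			  unsigned long long sb, unsigned int bc)
	{
		struct intent *i = malloc(sizeof(*i));

		*i = (struct intent){ .type = t, .startblock = sb,
				      .blockcount = bc };
		while (*q)
			q = &(*q)->next;
		*q = i;
	}

	int main(void)
	{
		struct intent *q = NULL;

		defer(&q, REFC_ALLOC_COW, 5000, 16);	/* at CoW allocation */
		defer(&q, REFC_FREE_COW, 5000, 16);	/* at remap time */

		while (q) {			/* the "finish_one" phase */
			struct intent *i = q;

			q = i->next;
			printf("finish intent %d on [%llu, +%u]\n",
			       (int)i->type, i->startblock, i->blockcount);
			free(i);
		}
		return 0;
	}

/*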
*/ +int +xfs_refcount_alloc_cow_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + xfs_fsblock_t fsb, + xfs_extlen_t len) +{ + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW, + fsb, len); +} + +/* Forget a CoW staging event in the refcount btree. */ +int +xfs_refcount_free_cow_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + xfs_fsblock_t fsb, + xfs_extlen_t len) +{ + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_FREE_COW, + fsb, len); +} + +struct xfs_refcount_recovery { + struct list_head rr_list; + struct xfs_refcount_irec rr_rrec; +}; + +/* Stuff an extent on the recovery list. */ +STATIC int +xfs_refcount_recover_extent( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + void *priv) +{ + struct list_head *debris = priv; + struct xfs_refcount_recovery *rr; + + if (be32_to_cpu(rec->refc.rc_refcount) != 1) + return -EFSCORRUPTED; + + rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), KM_SLEEP); + xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); + list_add_tail(&rr->rr_list, debris); + + return 0; +} + +/* Find and remove leftover CoW reservations. */ +int +xfs_refcount_recover_cow_leftovers( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_trans *tp; + struct xfs_btree_cur *cur; + struct xfs_buf *agbp; + struct xfs_refcount_recovery *rr, *n; + struct list_head debris; + union xfs_btree_irec low; + union xfs_btree_irec high; + struct xfs_defer_ops dfops; + xfs_fsblock_t fsb; + xfs_agblock_t agbno; + int error; + + if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START) + return -EOPNOTSUPP; + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error) + return error; + cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); + + /* Find all the leftover CoW staging extents. */ + INIT_LIST_HEAD(&debris); + memset(&low, 0, sizeof(low)); + memset(&high, 0, sizeof(high)); + low.rc.rc_startblock = XFS_REFC_COW_START; + high.rc.rc_startblock = -1U; + error = xfs_btree_query_range(cur, &low, &high, + xfs_refcount_recover_extent, &debris); + if (error) + goto out_cursor; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); + + /* Now iterate the list to free the leftovers */ + list_for_each_entry(rr, &debris, rr_list) { + /* Set up transaction. */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); + if (error) + goto out_free; + + trace_xfs_refcount_recover_extent(mp, agno, &rr->rr_rrec); + + /* Free the orphan record */ + xfs_defer_init(&dfops, &fsb); + agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START; + fsb = XFS_AGB_TO_FSB(mp, agno, agbno); + error = xfs_refcount_free_cow_extent(mp, &dfops, fsb, + rr->rr_rrec.rc_blockcount); + if (error) + goto out_defer; + + /* Free the block. 
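*/

The recovery function around this point has a collect-then-process shape: gather the stale staging records while the btree is locked, then free each one in its own transaction so none of them pins the log. The shape in miniature:

	#include <stdio.h>

	struct leftover { unsigned int agbno, len; };

	int main(void)
	{
		/* phase 1: the range query found these refcount == 1 records */
		struct leftover debris[] = { { 650, 16 }, { 900, 4 } };
		int n = sizeof(debris) / sizeof(debris[0]);

		/* phase 2: cursor and AGF buffer are long since released;
		 * roll one transaction per leftover */
		for (int i = 0; i < n; i++)
			printf("txn %d: free CoW leftover [%u, +%u]\n",
			       i, debris[i].agbno, debris[i].len);
		return 0;
	}

/*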
*/ + xfs_bmap_add_free(mp, &dfops, fsb, + rr->rr_rrec.rc_blockcount, NULL); + + error = xfs_defer_finish(&tp, &dfops, NULL); + if (error) + goto out_defer; + + error = xfs_trans_commit(tp); + if (error) + goto out_free; + } + +out_free: + /* Free the leftover list */ + list_for_each_entry_safe(rr, n, &debris, rr_list) { + list_del(&rr->rr_list); + kmem_free(rr); + } + return error; + +out_cursor: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + xfs_buf_relse(agbp); + goto out_free; + +out_defer: + xfs_defer_cancel(&dfops); + xfs_trans_cancel(tp); + goto out_free; +} diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h new file mode 100644 index 000000000000..098dc668ab2c --- /dev/null +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_REFCOUNT_H__ +#define __XFS_REFCOUNT_H__ + +extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur, + xfs_agblock_t bno, int *stat); +extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur, + xfs_agblock_t bno, int *stat); +extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur, + struct xfs_refcount_irec *irec, int *stat); + +enum xfs_refcount_intent_type { + XFS_REFCOUNT_INCREASE = 1, + XFS_REFCOUNT_DECREASE, + XFS_REFCOUNT_ALLOC_COW, + XFS_REFCOUNT_FREE_COW, +}; + +struct xfs_refcount_intent { + struct list_head ri_list; + enum xfs_refcount_intent_type ri_type; + xfs_fsblock_t ri_startblock; + xfs_extlen_t ri_blockcount; +}; + +extern int xfs_refcount_increase_extent(struct xfs_mount *mp, + struct xfs_defer_ops *dfops, struct xfs_bmbt_irec *irec); +extern int xfs_refcount_decrease_extent(struct xfs_mount *mp, + struct xfs_defer_ops *dfops, struct xfs_bmbt_irec *irec); + +extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, + struct xfs_btree_cur *rcur, int error); +extern int xfs_refcount_finish_one(struct xfs_trans *tp, + struct xfs_defer_ops *dfops, enum xfs_refcount_intent_type type, + xfs_fsblock_t startblock, xfs_extlen_t blockcount, + xfs_fsblock_t *new_fsb, xfs_extlen_t *new_len, + struct xfs_btree_cur **pcur); + +extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur, + xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, + xfs_extlen_t *flen, bool find_end_of_shared); + +extern int xfs_refcount_alloc_cow_extent(struct xfs_mount *mp, + struct xfs_defer_ops *dfops, xfs_fsblock_t fsb, + xfs_extlen_t len); +extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp, + struct xfs_defer_ops *dfops, xfs_fsblock_t fsb, + xfs_extlen_t len); +extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, + xfs_agnumber_t agno); + +#endif /* __XFS_REFCOUNT_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c 
b/fs/xfs/libxfs/xfs_refcount_btree.c new file mode 100644 index 000000000000..453bb2757ec2 --- /dev/null +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -0,0 +1,451 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_bmap.h" +#include "xfs_refcount_btree.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_bit.h" +#include "xfs_rmap.h" + +static struct xfs_btree_cur * +xfs_refcountbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agbp, cur->bc_private.a.agno, + cur->bc_private.a.dfops); +} + +STATIC void +xfs_refcountbt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int inc) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); + + ASSERT(ptr->s != 0); + + agf->agf_refcount_root = ptr->s; + be32_add_cpu(&agf->agf_refcount_level, inc); + pag->pagf_refcount_level += inc; + xfs_perag_put(pag); + + xfs_alloc_log_agf(cur->bc_tp, agbp, + XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL); +} + +STATIC int +xfs_refcountbt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_alloc_arg args; /* block allocation args */ + int error; /* error return value */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + memset(&args, 0, sizeof(args)); + args.tp = cur->bc_tp; + args.mp = cur->bc_mp; + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, + xfs_refc_block(args.mp)); + args.firstblock = args.fsbno; + xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_REFC); + args.minlen = args.maxlen = args.prod = 1; + args.resv = XFS_AG_RESV_METADATA; + + error = xfs_alloc_vextent(&args); + if (error) + goto out_error; + trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, + args.agbno, 1); + if (args.fsbno == NULLFSBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + ASSERT(args.agno == cur->bc_private.a.agno); + ASSERT(args.len == 1); + + new->s = cpu_to_be32(args.agbno); + be32_add_cpu(&agf->agf_refcount_blocks, 1); + xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + 
*stat = 1; + return 0; + +out_error: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +STATIC int +xfs_refcountbt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); + struct xfs_owner_info oinfo; + int error; + + trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno, + XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC); + be32_add_cpu(&agf->agf_refcount_blocks, -1); + xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); + error = xfs_free_extent(cur->bc_tp, fsbno, 1, &oinfo, + XFS_AG_RESV_METADATA); + if (error) + return error; + + return error; +} + +STATIC int +xfs_refcountbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_refc_mnr[level != 0]; +} + +STATIC int +xfs_refcountbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_refc_mxr[level != 0]; +} + +STATIC void +xfs_refcountbt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + key->refc.rc_startblock = rec->refc.rc_startblock; +} + +STATIC void +xfs_refcountbt_init_high_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + __u32 x; + + x = be32_to_cpu(rec->refc.rc_startblock); + x += be32_to_cpu(rec->refc.rc_blockcount) - 1; + key->refc.rc_startblock = cpu_to_be32(x); +} + +STATIC void +xfs_refcountbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + rec->refc.rc_startblock = cpu_to_be32(cur->bc_rec.rc.rc_startblock); + rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount); + rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount); +} + +STATIC void +xfs_refcountbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(agf->agf_refcount_root != 0); + + ptr->s = agf->agf_refcount_root; +} + +STATIC __int64_t +xfs_refcountbt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + struct xfs_refcount_irec *rec = &cur->bc_rec.rc; + struct xfs_refcount_key *kp = &key->refc; + + return (__int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock; +} + +STATIC __int64_t +xfs_refcountbt_diff_two_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return (__int64_t)be32_to_cpu(k1->refc.rc_startblock) - + be32_to_cpu(k2->refc.rc_startblock); +} + +STATIC bool +xfs_refcountbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + unsigned int level; + + if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC)) + return false; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return false; + if (!xfs_btree_sblock_v5hdr_verify(bp)) + return false; + + level = be16_to_cpu(block->bb_level); + if (pag && pag->pagf_init) { + if (level >= pag->pagf_refcount_level) + return false; + } else if (level >= mp->m_refc_maxlevels) + return false; + + return xfs_btree_sblock_verify(bp, mp->m_refc_mxr[level != 0]); +} + +STATIC void +xfs_refcountbt_read_verify( + struct xfs_buf *bp) +{ + if (!xfs_btree_sblock_verify_crc(bp)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if 
(!xfs_refcountbt_verify(bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); + } +} + +STATIC void +xfs_refcountbt_write_verify( + struct xfs_buf *bp) +{ + if (!xfs_refcountbt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + xfs_btree_sblock_calc_crc(bp); + +} + +const struct xfs_buf_ops xfs_refcountbt_buf_ops = { + .name = "xfs_refcountbt", + .verify_read = xfs_refcountbt_read_verify, + .verify_write = xfs_refcountbt_write_verify, +}; + +#if defined(DEBUG) || defined(XFS_WARN) +STATIC int +xfs_refcountbt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return be32_to_cpu(k1->refc.rc_startblock) < + be32_to_cpu(k2->refc.rc_startblock); +} + +STATIC int +xfs_refcountbt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + return be32_to_cpu(r1->refc.rc_startblock) + + be32_to_cpu(r1->refc.rc_blockcount) <= + be32_to_cpu(r2->refc.rc_startblock); +} +#endif + +static const struct xfs_btree_ops xfs_refcountbt_ops = { + .rec_len = sizeof(struct xfs_refcount_rec), + .key_len = sizeof(struct xfs_refcount_key), + + .dup_cursor = xfs_refcountbt_dup_cursor, + .set_root = xfs_refcountbt_set_root, + .alloc_block = xfs_refcountbt_alloc_block, + .free_block = xfs_refcountbt_free_block, + .get_minrecs = xfs_refcountbt_get_minrecs, + .get_maxrecs = xfs_refcountbt_get_maxrecs, + .init_key_from_rec = xfs_refcountbt_init_key_from_rec, + .init_high_key_from_rec = xfs_refcountbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_refcountbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_refcountbt_init_ptr_from_cur, + .key_diff = xfs_refcountbt_key_diff, + .buf_ops = &xfs_refcountbt_buf_ops, + .diff_two_keys = xfs_refcountbt_diff_two_keys, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_refcountbt_keys_inorder, + .recs_inorder = xfs_refcountbt_recs_inorder, +#endif +}; + +/* + * Allocate a new refcount btree cursor. + */ +struct xfs_btree_cur * +xfs_refcountbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno, + struct xfs_defer_ops *dfops) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_btree_cur *cur; + + ASSERT(agno != NULLAGNUMBER); + ASSERT(agno < mp->m_sb.sb_agcount); + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); + + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_btnum = XFS_BTNUM_REFC; + cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_ops = &xfs_refcountbt_ops; + + cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); + + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; + cur->bc_private.a.dfops = dfops; + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + + cur->bc_private.a.priv.refc.nr_ops = 0; + cur->bc_private.a.priv.refc.shape_changes = 0; + + return cur; +} + +/* + * Calculate the number of records in a refcount btree block. + */ +int +xfs_refcountbt_maxrecs( + struct xfs_mount *mp, + int blocklen, + bool leaf) +{ + blocklen -= XFS_REFCOUNT_BLOCK_LEN; + + if (leaf) + return blocklen / sizeof(struct xfs_refcount_rec); + return blocklen / (sizeof(struct xfs_refcount_key) + + sizeof(xfs_refcount_ptr_t)); +} + +/* Compute the maximum height of a refcount btree. 
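*/

The capacity arithmetic from xfs_refcountbt_maxrecs just above, standalone, using the on-disk sizes this patch implies (12-byte records, 4-byte keys and pointers); the 56-byte CRC block header length is an assumption:

	#include <stdio.h>

	#define BLOCK_HDR_LEN	56	/* short-form CRC btree block header */
	#define REC_LEN		12	/* startblock + blockcount + refcount */
	#define KEY_LEN		4	/* startblock only */
	#define PTR_LEN		4	/* AG-relative child pointer */

	static int maxrecs(int blocklen, int leaf)
	{
		blocklen -= BLOCK_HDR_LEN;
		if (leaf)
			return blocklen / REC_LEN;
		return blocklen / (KEY_LEN + PTR_LEN);	/* key+ptr pairs */
	}

	int main(void)
	{
		printf("4k block: %d leaf records, %d node entries\n",
		       maxrecs(4096, 1), maxrecs(4096, 0));
		return 0;
	}

With 4096-byte blocks this gives 336 records per leaf and 505 entries per interior node, which is what m_refc_mxr would hold.

/*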
*/ +void +xfs_refcountbt_compute_maxlevels( + struct xfs_mount *mp) +{ + mp->m_refc_maxlevels = xfs_btree_compute_maxlevels(mp, + mp->m_refc_mnr, mp->m_sb.sb_agblocks); +} + +/* Calculate the refcount btree size for some records. */ +xfs_extlen_t +xfs_refcountbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp, mp->m_refc_mnr, len); +} + +/* + * Calculate the maximum refcount btree size. + */ +xfs_extlen_t +xfs_refcountbt_max_size( + struct xfs_mount *mp) +{ + /* Bail out if we're uninitialized, which can happen in mkfs. */ + if (mp->m_refc_mxr[0] == 0) + return 0; + + return xfs_refcountbt_calc_size(mp, mp->m_sb.sb_agblocks); +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. + */ +int +xfs_refcountbt_calc_reserves( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_extlen_t *ask, + xfs_extlen_t *used) +{ + struct xfs_buf *agbp; + struct xfs_agf *agf; + xfs_extlen_t tree_len; + int error; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + *ask += xfs_refcountbt_max_size(mp); + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error) + return error; + + agf = XFS_BUF_TO_AGF(agbp); + tree_len = be32_to_cpu(agf->agf_refcount_blocks); + xfs_buf_relse(agbp); + + *used += tree_len; + + return error; +} diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h new file mode 100644 index 000000000000..3be7768bd51a --- /dev/null +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_REFCOUNT_BTREE_H__ +#define __XFS_REFCOUNT_BTREE_H__ + +/* + * Reference Count Btree on-disk structures + */ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; + +/* + * Btree block header size + */ +#define XFS_REFCOUNT_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN + +/* + * Record, key, and pointer address macros for btree blocks. 
+ * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define XFS_REFCOUNT_REC_ADDR(block, index) \ + ((struct xfs_refcount_rec *) \ + ((char *)(block) + \ + XFS_REFCOUNT_BLOCK_LEN + \ + (((index) - 1) * sizeof(struct xfs_refcount_rec)))) + +#define XFS_REFCOUNT_KEY_ADDR(block, index) \ + ((struct xfs_refcount_key *) \ + ((char *)(block) + \ + XFS_REFCOUNT_BLOCK_LEN + \ + ((index) - 1) * sizeof(struct xfs_refcount_key))) + +#define XFS_REFCOUNT_PTR_ADDR(block, index, maxrecs) \ + ((xfs_refcount_ptr_t *) \ + ((char *)(block) + \ + XFS_REFCOUNT_BLOCK_LEN + \ + (maxrecs) * sizeof(struct xfs_refcount_key) + \ + ((index) - 1) * sizeof(xfs_refcount_ptr_t))) + +extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp, + struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno, + struct xfs_defer_ops *dfops); +extern int xfs_refcountbt_maxrecs(struct xfs_mount *mp, int blocklen, + bool leaf); +extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); + +extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp, + unsigned long long len); +extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp); + +extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp, + xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); + +#endif /* __XFS_REFCOUNT_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 73d05407d663..3a8cc7139912 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -148,6 +148,37 @@ done: return error; } +STATIC int +xfs_rmap_delete( + struct xfs_btree_cur *rcur, + xfs_agblock_t agbno, + xfs_extlen_t len, + uint64_t owner, + uint64_t offset, + unsigned int flags) +{ + int i; + int error; + + trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_private.a.agno, agbno, + len, owner, offset, flags); + + error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done); + + error = xfs_btree_delete(rcur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done); +done: + if (error) + trace_xfs_rmap_delete_error(rcur->bc_mp, + rcur->bc_private.a.agno, error, _RET_IP_); + return error; +} + static int xfs_rmap_btrec_to_irec( union xfs_btree_rec *rec, @@ -180,6 +211,160 @@ xfs_rmap_get_rec( return xfs_rmap_btrec_to_irec(rec, irec); } +struct xfs_find_left_neighbor_info { + struct xfs_rmap_irec high; + struct xfs_rmap_irec *irec; + int *stat; +}; + +/* For each rmap given, figure out if it matches the key we want. */ +STATIC int +xfs_rmap_find_left_neighbor_helper( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_find_left_neighbor_info *info = priv; + + trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp, + cur->bc_private.a.agno, rec->rm_startblock, + rec->rm_blockcount, rec->rm_owner, rec->rm_offset, + rec->rm_flags); + + if (rec->rm_owner != info->high.rm_owner) + return XFS_BTREE_QUERY_RANGE_CONTINUE; + if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && + !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && + rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset) + return XFS_BTREE_QUERY_RANGE_CONTINUE; + + *info->irec = *rec; + *info->stat = 1; + return XFS_BTREE_QUERY_RANGE_ABORT; +} + +/* + * Find the record to the left of the given extent, being careful only to + * return a match with the same owner and adjacent physical and logical + * block ranges. 
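*/

The adjacency test in the helper above comes down to one comparison: the candidate's logical range must end exactly one block before ours begins. Worked standalone, with invented numbers:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long rec_offset = 100;	/* candidate record */
		unsigned int rec_blockcount = 20;
		unsigned long long offset = 120;	/* start of our mapping */

		/* the query looks up offset - 1; the record must end there */
		int adjacent = (rec_offset + rec_blockcount - 1 == offset - 1);

		printf("record [%llu, +%u] %s logically adjacent to %llu\n",
		       rec_offset, rec_blockcount,
		       adjacent ? "is" : "is not", offset);
		return 0;
	}

/*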
+ */ +int +xfs_rmap_find_left_neighbor( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + uint64_t owner, + uint64_t offset, + unsigned int flags, + struct xfs_rmap_irec *irec, + int *stat) +{ + struct xfs_find_left_neighbor_info info; + int error; + + *stat = 0; + if (bno == 0) + return 0; + info.high.rm_startblock = bno - 1; + info.high.rm_owner = owner; + if (!XFS_RMAP_NON_INODE_OWNER(owner) && + !(flags & XFS_RMAP_BMBT_BLOCK)) { + if (offset == 0) + return 0; + info.high.rm_offset = offset - 1; + } else + info.high.rm_offset = 0; + info.high.rm_flags = flags; + info.high.rm_blockcount = 0; + info.irec = irec; + info.stat = stat; + + trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp, + cur->bc_private.a.agno, bno, 0, owner, offset, flags); + + error = xfs_rmap_query_range(cur, &info.high, &info.high, + xfs_rmap_find_left_neighbor_helper, &info); + if (error == XFS_BTREE_QUERY_RANGE_ABORT) + error = 0; + if (*stat) + trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, + cur->bc_private.a.agno, irec->rm_startblock, + irec->rm_blockcount, irec->rm_owner, + irec->rm_offset, irec->rm_flags); + return error; +} + +/* For each rmap given, figure out if it matches the key we want. */ +STATIC int +xfs_rmap_lookup_le_range_helper( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_find_left_neighbor_info *info = priv; + + trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp, + cur->bc_private.a.agno, rec->rm_startblock, + rec->rm_blockcount, rec->rm_owner, rec->rm_offset, + rec->rm_flags); + + if (rec->rm_owner != info->high.rm_owner) + return XFS_BTREE_QUERY_RANGE_CONTINUE; + if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && + !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && + (rec->rm_offset > info->high.rm_offset || + rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset)) + return XFS_BTREE_QUERY_RANGE_CONTINUE; + + *info->irec = *rec; + *info->stat = 1; + return XFS_BTREE_QUERY_RANGE_ABORT; +} + +/* + * Find the record to the left of the given extent, being careful only to + * return a match with the same owner and overlapping physical and logical + * block ranges. This is the overlapping-interval version of + * xfs_rmap_lookup_le. + */ +int +xfs_rmap_lookup_le_range( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + uint64_t owner, + uint64_t offset, + unsigned int flags, + struct xfs_rmap_irec *irec, + int *stat) +{ + struct xfs_find_left_neighbor_info info; + int error; + + info.high.rm_startblock = bno; + info.high.rm_owner = owner; + if (!XFS_RMAP_NON_INODE_OWNER(owner) && !(flags & XFS_RMAP_BMBT_BLOCK)) + info.high.rm_offset = offset; + else + info.high.rm_offset = 0; + info.high.rm_flags = flags; + info.high.rm_blockcount = 0; + *stat = 0; + info.irec = irec; + info.stat = stat; + + trace_xfs_rmap_lookup_le_range(cur->bc_mp, + cur->bc_private.a.agno, bno, 0, owner, offset, flags); + error = xfs_rmap_query_range(cur, &info.high, &info.high, + xfs_rmap_lookup_le_range_helper, &info); + if (error == XFS_BTREE_QUERY_RANGE_ABORT) + error = 0; + if (*stat) + trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, + cur->bc_private.a.agno, irec->rm_startblock, + irec->rm_blockcount, irec->rm_owner, + irec->rm_offset, irec->rm_flags); + return error; +} + /* * Find the extent in the rmap btree and remove it. * @@ -1093,11 +1278,704 @@ done: return error; } +/* + * Convert an unwritten extent to a real extent or vice versa. If there is no + * possibility of overlapping extents, delegate to the simpler convert + * function. 
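*/

Both lookup helpers above use the same convention: the visitor aborts the range query at the first match, and the caller translates the abort code back into success. A toy version of that control flow:

	#include <stdio.h>

	#define QUERY_CONTINUE	0
	#define QUERY_ABORT	(-1)	/* stand-in for XFS_BTREE_QUERY_RANGE_ABORT */

	struct match { int wanted; int found; int value; };

	static int visit(int rec, void *priv)
	{
		struct match *m = priv;

		if (rec != m->wanted)
			return QUERY_CONTINUE;
		m->value = rec;
		m->found = 1;
		return QUERY_ABORT;	/* first match wins, stop the walk */
	}

	static int query_range(const int *recs, int n,
			       int (*fn)(int, void *), void *priv)
	{
		for (int i = 0; i < n; i++) {
			int ret = fn(recs[i], priv);
			if (ret)
				return ret;
		}
		return 0;
	}

	int main(void)
	{
		int recs[] = { 3, 7, 7, 9 };
		struct match m = { .wanted = 7 };

		int error = query_range(recs, 4, visit, &m);
		if (error == QUERY_ABORT)
			error = 0;	/* the abort just meant "found it" */
		printf("error %d, found %d, value %d\n",
		       error, m.found, m.value);
		return 0;
	}

/*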
+ */ +STATIC int +xfs_rmap_convert_shared( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + struct xfs_owner_info *oinfo) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec r[4]; /* neighbor extent entries */ + /* left is 0, right is 1, prev is 2 */ + /* new is 3 */ + uint64_t owner; + uint64_t offset; + uint64_t new_endoff; + unsigned int oldext; + unsigned int newext; + unsigned int flags = 0; + int i; + int state = 0; + int error; + + xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); + ASSERT(!(XFS_RMAP_NON_INODE_OWNER(owner) || + (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); + oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; + new_endoff = offset + len; + trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); + + /* + * For the initial lookup, look for an exact match or the left-adjacent + * record for our insertion point. This will also give us the record for + * start block contiguity tests. + */ + error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags, + &PREV, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + + ASSERT(PREV.rm_offset <= offset); + ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff); + ASSERT((PREV.rm_flags & XFS_RMAP_UNWRITTEN) == oldext); + newext = ~oldext & XFS_RMAP_UNWRITTEN; + + /* + * Set flags determining what part of the previous oldext allocation + * extent is being replaced by a newext allocation. + */ + if (PREV.rm_offset == offset) + state |= RMAP_LEFT_FILLING; + if (PREV.rm_offset + PREV.rm_blockcount == new_endoff) + state |= RMAP_RIGHT_FILLING; + + /* Is there a left record that abuts our range? */ + error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, newext, + &LEFT, &i); + if (error) + goto done; + if (i) { + state |= RMAP_LEFT_VALID; + XFS_WANT_CORRUPTED_GOTO(mp, + LEFT.rm_startblock + LEFT.rm_blockcount <= bno, + done); + if (xfs_rmap_is_mergeable(&LEFT, owner, newext)) + state |= RMAP_LEFT_CONTIG; + } + + /* Is there a right record that abuts our range? */ + error = xfs_rmap_lookup_eq(cur, bno + len, len, owner, offset + len, + newext, &i); + if (error) + goto done; + if (i) { + state |= RMAP_RIGHT_VALID; + error = xfs_rmap_get_rec(cur, &RIGHT, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= RIGHT.rm_startblock, + done); + trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, + cur->bc_private.a.agno, RIGHT.rm_startblock, + RIGHT.rm_blockcount, RIGHT.rm_owner, + RIGHT.rm_offset, RIGHT.rm_flags); + if (xfs_rmap_is_mergeable(&RIGHT, owner, newext)) + state |= RMAP_RIGHT_CONTIG; + } + + /* check that left + prev + right is not too long */ + if ((state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) == + (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG) && + (unsigned long)LEFT.rm_blockcount + len + + RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) + state &= ~RMAP_RIGHT_CONTIG; + + trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state, + _RET_IP_); + /* + * Switch out based on the FILLING and CONTIG state bits. + */ + switch (state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) { + case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The left and right neighbors are both contiguous with new. 
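*/

The sixteen-way switch below is easier to follow once the encoding is clear: four independent facts about the neighborhood become one mask, and each legal mask selects one btree edit sequence. Minimal sketch, bit values illustrative:

	#include <stdio.h>

	#define LEFT_FILLING	(1 << 0)	/* range starts where PREV starts */
	#define RIGHT_FILLING	(1 << 1)	/* range ends where PREV ends */
	#define LEFT_CONTIG	(1 << 2)	/* mergeable record on the left */
	#define RIGHT_CONTIG	(1 << 3)	/* mergeable record on the right */

	int main(void)
	{
		int state = LEFT_FILLING | RIGHT_FILLING | LEFT_CONTIG;

		switch (state & (LEFT_FILLING | RIGHT_FILLING |
				 LEFT_CONTIG | RIGHT_CONTIG)) {
		case LEFT_FILLING | RIGHT_FILLING | LEFT_CONTIG | RIGHT_CONTIG:
			printf("merge PREV into both neighbors (3 -> 1 records)\n");
			break;
		case LEFT_FILLING | RIGHT_FILLING | LEFT_CONTIG:
			printf("merge PREV into the left neighbor (2 -> 1)\n");
			break;
		case LEFT_FILLING | RIGHT_FILLING:
			printf("flip PREV's unwritten flag in place\n");
			break;
		case 0:
			printf("split PREV into three records\n");
			break;
		default:
			printf("one of the other handled (or impossible) cases\n");
		}
		return 0;
	}

/*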
+ */ + error = xfs_rmap_delete(cur, RIGHT.rm_startblock, + RIGHT.rm_blockcount, RIGHT.rm_owner, + RIGHT.rm_offset, RIGHT.rm_flags); + if (error) + goto done; + error = xfs_rmap_delete(cur, PREV.rm_startblock, + PREV.rm_blockcount, PREV.rm_owner, + PREV.rm_offset, PREV.rm_flags); + if (error) + goto done; + NEW = LEFT; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The left neighbor is contiguous, the right is not. + */ + error = xfs_rmap_delete(cur, PREV.rm_startblock, + PREV.rm_blockcount, PREV.rm_owner, + PREV.rm_offset, PREV.rm_flags); + if (error) + goto done; + NEW = LEFT; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount += PREV.rm_blockcount; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The right neighbor is contiguous, the left is not. + */ + error = xfs_rmap_delete(cur, RIGHT.rm_startblock, + RIGHT.rm_blockcount, RIGHT.rm_owner, + RIGHT.rm_offset, RIGHT.rm_flags); + if (error) + goto done; + NEW = PREV; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount += RIGHT.rm_blockcount; + NEW.rm_flags = RIGHT.rm_flags; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING: + /* + * Setting all of a previous oldext extent to newext. + * Neither the left nor right neighbors are contiguous with + * the new one. + */ + NEW = PREV; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_flags = newext; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG: + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is contiguous. + */ + NEW = PREV; + error = xfs_rmap_delete(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + NEW.rm_offset += len; + NEW.rm_startblock += len; + NEW.rm_blockcount -= len; + error = xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + NEW = LEFT; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount += len; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING: + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is not contiguous. 
+ */ + NEW = PREV; + error = xfs_rmap_delete(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + NEW.rm_offset += len; + NEW.rm_startblock += len; + NEW.rm_blockcount -= len; + error = xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + error = xfs_rmap_insert(cur, bno, len, owner, offset, newext); + if (error) + goto done; + break; + + case RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is contiguous with the new allocation. + */ + NEW = PREV; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount = offset - NEW.rm_offset; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + NEW = RIGHT; + error = xfs_rmap_delete(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + NEW.rm_offset = offset; + NEW.rm_startblock = bno; + NEW.rm_blockcount += len; + error = xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + break; + + case RMAP_RIGHT_FILLING: + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is not contiguous. + */ + NEW = PREV; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount -= len; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + error = xfs_rmap_insert(cur, bno, len, owner, offset, newext); + if (error) + goto done; + break; + + case 0: + /* + * Setting the middle part of a previous oldext extent to + * newext. Contiguity is impossible here. + * One extent becomes three extents. + */ + /* new right extent - oldext */ + NEW.rm_startblock = bno + len; + NEW.rm_owner = owner; + NEW.rm_offset = new_endoff; + NEW.rm_blockcount = PREV.rm_offset + PREV.rm_blockcount - + new_endoff; + NEW.rm_flags = PREV.rm_flags; + error = xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset, + NEW.rm_flags); + if (error) + goto done; + /* new left extent - oldext */ + NEW = PREV; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount = offset - NEW.rm_offset; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + /* new middle extent - newext */ + NEW.rm_startblock = bno; + NEW.rm_blockcount = len; + NEW.rm_owner = owner; + NEW.rm_offset = offset; + NEW.rm_flags = newext; + error = xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset, + NEW.rm_flags); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: + case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: + case RMAP_LEFT_FILLING | RMAP_RIGHT_CONTIG: + case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG: + case RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: + case RMAP_LEFT_CONTIG: + case RMAP_RIGHT_CONTIG: + /* + * These cases are all impossible. 
+ */ + ASSERT(0); + } + + trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); +done: + if (error) + trace_xfs_rmap_convert_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + #undef NEW #undef LEFT #undef RIGHT #undef PREV +/* + * Find an extent in the rmap btree and unmap it. For rmap extent types that + * can overlap (data fork rmaps on reflink filesystems) we must be careful + * that the prev/next records in the btree might belong to another owner. + * Therefore we must use delete+insert to alter any of the key fields. + * + * For every other situation there can only be one owner for a given extent, + * so we can call the regular _free function. + */ +STATIC int +xfs_rmap_unmap_shared( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + struct xfs_owner_info *oinfo) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec ltrec; + uint64_t ltoff; + int error = 0; + int i; + uint64_t owner; + uint64_t offset; + unsigned int flags; + + xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); + if (unwritten) + flags |= XFS_RMAP_UNWRITTEN; + trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); + + /* + * We should always have a left record because there's a static record + * for the AG headers at rm_startblock == 0 created by mkfs/growfs that + * will not ever be removed from the tree. + */ + error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags, + <rec, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + ltoff = ltrec.rm_offset; + + /* Make sure the extent we found covers the entire freeing range. */ + XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno && + ltrec.rm_startblock + ltrec.rm_blockcount >= + bno + len, out_error); + + /* Make sure the owner matches what we expect to find in the tree. */ + XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner, out_error); + + /* Make sure the unwritten flag matches. */ + XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) == + (ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error); + + /* Check the offset. */ + XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_offset <= offset, out_error); + XFS_WANT_CORRUPTED_GOTO(mp, offset <= ltoff + ltrec.rm_blockcount, + out_error); + + if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { + /* Exact match, simply remove the record from rmap tree. */ + error = xfs_rmap_delete(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags); + if (error) + goto out_error; + } else if (ltrec.rm_startblock == bno) { + /* + * Overlap left hand side of extent: move the start, trim the + * length and update the current record. + * + * ltbno ltlen + * Orig: |oooooooooooooooooooo| + * Freeing: |fffffffff| + * Result: |rrrrrrrrrr| + * bno len + */ + + /* Delete prev rmap. */ + error = xfs_rmap_delete(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags); + if (error) + goto out_error; + + /* Add an rmap at the new offset. */ + ltrec.rm_startblock += len; + ltrec.rm_blockcount -= len; + ltrec.rm_offset += len; + error = xfs_rmap_insert(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags); + if (error) + goto out_error; + } else if (ltrec.rm_startblock + ltrec.rm_blockcount == bno + len) { + /* + * Overlap right hand side of extent: trim the length and + * update the current record. 
+ * + * ltbno ltlen + * Orig: |oooooooooooooooooooo| + * Freeing: |fffffffff| + * Result: |rrrrrrrrrr| + * bno len + */ + error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + ltrec.rm_blockcount -= len; + error = xfs_rmap_update(cur, <rec); + if (error) + goto out_error; + } else { + /* + * Overlap middle of extent: trim the length of the existing + * record to the length of the new left-extent size, increment + * the insertion position so we can insert a new record + * containing the remaining right-extent space. + * + * ltbno ltlen + * Orig: |oooooooooooooooooooo| + * Freeing: |fffffffff| + * Result: |rrrrr| |rrrr| + * bno len + */ + xfs_extlen_t orig_len = ltrec.rm_blockcount; + + /* Shrink the left side of the rmap */ + error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + ltrec.rm_blockcount = bno - ltrec.rm_startblock; + error = xfs_rmap_update(cur, <rec); + if (error) + goto out_error; + + /* Add an rmap at the new offset */ + error = xfs_rmap_insert(cur, bno + len, + orig_len - len - ltrec.rm_blockcount, + ltrec.rm_owner, offset + len, + ltrec.rm_flags); + if (error) + goto out_error; + } + + trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); +out_error: + if (error) + trace_xfs_rmap_unmap_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Find an extent in the rmap btree and map it. For rmap extent types that + * can overlap (data fork rmaps on reflink filesystems) we must be careful + * that the prev/next records in the btree might belong to another owner. + * Therefore we must use delete+insert to alter any of the key fields. + * + * For every other situation there can only be one owner for a given extent, + * so we can call the regular _alloc function. + */ +STATIC int +xfs_rmap_map_shared( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + struct xfs_owner_info *oinfo) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec ltrec; + struct xfs_rmap_irec gtrec; + int have_gt; + int have_lt; + int error = 0; + int i; + uint64_t owner; + uint64_t offset; + unsigned int flags = 0; + + xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); + if (unwritten) + flags |= XFS_RMAP_UNWRITTEN; + trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); + + /* Is there a left record that abuts our range? */ + error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, flags, + <rec, &have_lt); + if (error) + goto out_error; + if (have_lt && + !xfs_rmap_is_mergeable(<rec, owner, flags)) + have_lt = 0; + + /* Is there a right record that abuts our range? 
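*/

In the merges below, the left-edge case can finish with a plain update because the surviving record keeps its key (startblock and offset are unchanged). Worked numbers:

	#include <stdio.h>

	struct rmap_rec { unsigned int start, len; unsigned long long off; };

	int main(void)
	{
		struct rmap_rec lt = { 100, 20, 1000 };	/* existing left record */
		struct rmap_rec gt = { 130, 10, 1030 };	/* existing right record */
		unsigned int bno = 120, len = 10;	/* mapping being added */
		unsigned long long off = 1020;		/* its file offset */

		int left_contig = (lt.start + lt.len == bno &&
				   lt.off + lt.len == off);
		int right_contig = (bno + len == gt.start && off + len == gt.off);

		if (left_contig) {
			lt.len += len;			/* extend left record */
			if (right_contig)
				lt.len += gt.len;	/* absorb right record */
			printf("merged: [%u, +%u @%llu]\n",
			       lt.start, lt.len, lt.off);
		}
		return 0;
	}

Note that both the physical edge (startblock) and the logical edge (offset) must line up before a merge is allowed.

/*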
*/ + error = xfs_rmap_lookup_eq(cur, bno + len, len, owner, offset + len, + flags, &have_gt); + if (error) + goto out_error; + if (have_gt) { + error = xfs_rmap_get_rec(cur, >rec, &have_gt); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error); + trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, + cur->bc_private.a.agno, gtrec.rm_startblock, + gtrec.rm_blockcount, gtrec.rm_owner, + gtrec.rm_offset, gtrec.rm_flags); + + if (!xfs_rmap_is_mergeable(>rec, owner, flags)) + have_gt = 0; + } + + if (have_lt && + ltrec.rm_startblock + ltrec.rm_blockcount == bno && + ltrec.rm_offset + ltrec.rm_blockcount == offset) { + /* + * Left edge contiguous, merge into left record. + * + * ltbno ltlen + * orig: |ooooooooo| + * adding: |aaaaaaaaa| + * result: |rrrrrrrrrrrrrrrrrrr| + * bno len + */ + ltrec.rm_blockcount += len; + if (have_gt && + bno + len == gtrec.rm_startblock && + offset + len == gtrec.rm_offset) { + /* + * Right edge also contiguous, delete right record + * and merge into left record. + * + * ltbno ltlen gtbno gtlen + * orig: |ooooooooo| |ooooooooo| + * adding: |aaaaaaaaa| + * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr| + */ + ltrec.rm_blockcount += gtrec.rm_blockcount; + error = xfs_rmap_delete(cur, gtrec.rm_startblock, + gtrec.rm_blockcount, gtrec.rm_owner, + gtrec.rm_offset, gtrec.rm_flags); + if (error) + goto out_error; + } + + /* Point the cursor back to the left record and update. */ + error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + + error = xfs_rmap_update(cur, <rec); + if (error) + goto out_error; + } else if (have_gt && + bno + len == gtrec.rm_startblock && + offset + len == gtrec.rm_offset) { + /* + * Right edge contiguous, merge into right record. + * + * gtbno gtlen + * Orig: |ooooooooo| + * adding: |aaaaaaaaa| + * Result: |rrrrrrrrrrrrrrrrrrr| + * bno len + */ + /* Delete the old record. */ + error = xfs_rmap_delete(cur, gtrec.rm_startblock, + gtrec.rm_blockcount, gtrec.rm_owner, + gtrec.rm_offset, gtrec.rm_flags); + if (error) + goto out_error; + + /* Move the start and re-add it. */ + gtrec.rm_startblock = bno; + gtrec.rm_blockcount += len; + gtrec.rm_offset = offset; + error = xfs_rmap_insert(cur, gtrec.rm_startblock, + gtrec.rm_blockcount, gtrec.rm_owner, + gtrec.rm_offset, gtrec.rm_flags); + if (error) + goto out_error; + } else { + /* + * No contiguous edge with identical owner, insert + * new record at current cursor position. 
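
As an aside on the merge logic in xfs_rmap_map_shared() above: the left/right neighbor checks reduce to a small decision table. The following is a minimal standalone sketch of that decision, using simplified stand-in types rather than the kernel's structures:

    /*
     * Hedged sketch of the merge decision in xfs_rmap_map_shared().
     * "struct rec" is a simplified stand-in for struct xfs_rmap_irec.
     */
    struct rec { unsigned long long start, len, off; };

    enum merge { MERGE_NONE, MERGE_LEFT, MERGE_RIGHT, MERGE_BOTH };

    static enum merge
    merge_decision(const struct rec *lt, int have_lt,
                   const struct rec *gt, int have_gt,
                   unsigned long long bno, unsigned long long len,
                   unsigned long long off)
    {
            /*
             * Left merge: lt must end exactly where the new extent starts,
             * in both block space and logical-offset space.
             */
            int left = have_lt && lt->start + lt->len == bno &&
                       lt->off + lt->len == off;
            /* Right merge: gt must begin exactly where the new extent ends. */
            int right = have_gt && bno + len == gt->start &&
                        off + len == gt->off;

            if (left && right)
                    return MERGE_BOTH;   /* delete gt, grow lt by len + gt->len */
            if (left)
                    return MERGE_LEFT;   /* grow lt by len */
            if (right)
                    return MERGE_RIGHT;  /* delete gt, re-insert starting at bno */
            return MERGE_NONE;           /* insert a fresh record */
    }

The two-sided test matters because a record is only mergeable when both the block range and the logical file offset line up, on top of the owner/flags check done by xfs_rmap_is_mergeable().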
+ */ + error = xfs_rmap_insert(cur, bno, len, owner, offset, flags); + if (error) + goto out_error; + } + + trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); +out_error: + if (error) + trace_xfs_rmap_map_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + struct xfs_rmap_query_range_info { xfs_rmap_query_range_fn fn; void *priv; @@ -1237,15 +2115,27 @@ xfs_rmap_finish_one( case XFS_RMAP_MAP: error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo); break; + case XFS_RMAP_MAP_SHARED: + error = xfs_rmap_map_shared(rcur, bno, blockcount, unwritten, + &oinfo); + break; case XFS_RMAP_FREE: case XFS_RMAP_UNMAP: error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten, &oinfo); break; + case XFS_RMAP_UNMAP_SHARED: + error = xfs_rmap_unmap_shared(rcur, bno, blockcount, unwritten, + &oinfo); + break; case XFS_RMAP_CONVERT: error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten, &oinfo); break; + case XFS_RMAP_CONVERT_SHARED: + error = xfs_rmap_convert_shared(rcur, bno, blockcount, + !unwritten, &oinfo); + break; default: ASSERT(0); error = -EFSCORRUPTED; @@ -1263,9 +2153,10 @@ out_cur: */ static bool xfs_rmap_update_is_needed( - struct xfs_mount *mp) + struct xfs_mount *mp, + int whichfork) { - return xfs_sb_version_hasrmapbt(&mp->m_sb); + return xfs_sb_version_hasrmapbt(&mp->m_sb) && whichfork != XFS_COW_FORK; } /* @@ -1311,10 +2202,11 @@ xfs_rmap_map_extent( int whichfork, struct xfs_bmbt_irec *PREV) { - if (!xfs_rmap_update_is_needed(mp)) + if (!xfs_rmap_update_is_needed(mp, whichfork)) return 0; - return __xfs_rmap_add(mp, dfops, XFS_RMAP_MAP, ip->i_ino, + return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ? + XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino, whichfork, PREV); } @@ -1327,10 +2219,11 @@ xfs_rmap_unmap_extent( int whichfork, struct xfs_bmbt_irec *PREV) { - if (!xfs_rmap_update_is_needed(mp)) + if (!xfs_rmap_update_is_needed(mp, whichfork)) return 0; - return __xfs_rmap_add(mp, dfops, XFS_RMAP_UNMAP, ip->i_ino, + return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ? + XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino, whichfork, PREV); } @@ -1343,10 +2236,11 @@ xfs_rmap_convert_extent( int whichfork, struct xfs_bmbt_irec *PREV) { - if (!xfs_rmap_update_is_needed(mp)) + if (!xfs_rmap_update_is_needed(mp, whichfork)) return 0; - return __xfs_rmap_add(mp, dfops, XFS_RMAP_CONVERT, ip->i_ino, + return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ? 
+			XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino,
 			whichfork, PREV);
 }
 
@@ -1362,7 +2256,7 @@ xfs_rmap_alloc_extent(
 {
 	struct xfs_bmbt_irec	bmap;
 
-	if (!xfs_rmap_update_is_needed(mp))
+	if (!xfs_rmap_update_is_needed(mp, XFS_DATA_FORK))
 		return 0;
 
 	bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno);
@@ -1386,7 +2280,7 @@ xfs_rmap_free_extent(
 {
 	struct xfs_bmbt_irec	bmap;
 
-	if (!xfs_rmap_update_is_needed(mp))
+	if (!xfs_rmap_update_is_needed(mp, XFS_DATA_FORK))
 		return 0;
 
 	bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno);
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 71cf99a4acba..789930599339 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -206,4 +206,11 @@ int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type,
 		xfs_fsblock_t startblock, xfs_filblks_t blockcount,
 		xfs_exntst_t state, struct xfs_btree_cur **pcur);
 
+int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+		uint64_t owner, uint64_t offset, unsigned int flags,
+		struct xfs_rmap_irec *irec, int *stat);
+int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+		uint64_t owner, uint64_t offset, unsigned int flags,
+		struct xfs_rmap_irec *irec, int *stat);
+
 #endif	/* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 17b8eeb34ac8..83e672ff7577 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -35,6 +35,7 @@
 #include "xfs_cksum.h"
 #include "xfs_error.h"
 #include "xfs_extent_busy.h"
+#include "xfs_ag_resv.h"
 
 /*
 * Reverse map btree.
@@ -512,6 +513,83 @@ void
 xfs_rmapbt_compute_maxlevels(
 	struct xfs_mount		*mp)
 {
-	mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp,
-			mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
+	/*
+	 * On a non-reflink filesystem, the maximum number of rmap
+	 * records is the number of blocks in the AG, hence the max
+	 * rmapbt height is log_$maxrecs($agblocks). However, with
+	 * reflink each AG block can have up to 2^32 (per the refcount
+	 * record format) owners, which means that theoretically we
+	 * could face up to 2^64 rmap records.
+	 *
+	 * That effectively means that the max rmapbt height must be
+	 * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG
+	 * blocks to feed the rmapbt long before the rmapbt reaches
+	 * maximum height. The reflink code uses ag_resv_critical to
+	 * disallow reflinking when less than 10% of the per-AG metadata
+	 * block reservation is left, since the fallback is a regular
+	 * file copy.
+	 */
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS;
+	else
+		mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp,
+				mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
+}
+
+/* Calculate the rmap btree size for some records. */
+xfs_extlen_t
+xfs_rmapbt_calc_size(
+	struct xfs_mount	*mp,
+	unsigned long long	len)
+{
+	return xfs_btree_calc_size(mp, mp->m_rmap_mnr, len);
+}
+
+/*
+ * Calculate the maximum rmap btree size.
+ */
+xfs_extlen_t
+xfs_rmapbt_max_size(
+	struct xfs_mount	*mp)
+{
+	/* Bail out if we're uninitialized, which can happen in mkfs. */
+	if (mp->m_rmap_mxr[0] == 0)
+		return 0;
+
+	return xfs_rmapbt_calc_size(mp, mp->m_sb.sb_agblocks);
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */ +int +xfs_rmapbt_calc_reserves( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_extlen_t *ask, + xfs_extlen_t *used) +{ + struct xfs_buf *agbp; + struct xfs_agf *agf; + xfs_extlen_t pool_len; + xfs_extlen_t tree_len; + int error; + + if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + return 0; + + /* Reserve 1% of the AG or enough for 1 block per record. */ + pool_len = max(mp->m_sb.sb_agblocks / 100, xfs_rmapbt_max_size(mp)); + *ask += pool_len; + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error) + return error; + + agf = XFS_BUF_TO_AGF(agbp); + tree_len = be32_to_cpu(agf->agf_rmap_blocks); + xfs_buf_relse(agbp); + + *used += tree_len; + + return error; } diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index e73a55357dab..2a9ac472fb15 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -58,4 +58,11 @@ struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, int xfs_rmapbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf); extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); +extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp, + unsigned long long len); +extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp); + +extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp, + xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); + #endif /* __XFS_RMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 4aecc5fefe96..a70aec910626 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -38,6 +38,8 @@ #include "xfs_ialloc_btree.h" #include "xfs_log.h" #include "xfs_rmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_refcount_btree.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. @@ -737,6 +739,13 @@ xfs_sb_mount_common( mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2; mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2; + mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, + true); + mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, + false); + mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2; + mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2; + mp->m_bsize = XFS_FSB_TO_BB(mp, 1); mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, sbp->sb_inopblock); diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 0c5b30bd884c..c6f4eb46fe26 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -39,6 +39,7 @@ extern const struct xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agfl_buf_ops; extern const struct xfs_buf_ops xfs_allocbt_buf_ops; extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; +extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; extern const struct xfs_buf_ops xfs_bmbt_buf_ops; @@ -122,6 +123,7 @@ int xfs_log_calc_minimum_size(struct xfs_mount *); #define XFS_INO_REF 2 #define XFS_ATTR_BTREE_REF 1 #define XFS_DQUOT_REF 1 +#define XFS_REFC_BTREE_REF 1 /* * Flags for xfs_trans_ichgtime(). diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 301ef2f4dbd6..b456cca1bfb2 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -67,13 +67,14 @@ xfs_calc_buf_res( * Per-extent log reservation for the btree changes involved in freeing or * allocating an extent. In classic XFS there were two trees that will be * modified (bnobt + cntbt). 
With rmap enabled, there are three trees - * (rmapbt). The number of blocks reserved is based on the formula: + * (rmapbt). With reflink, there are four trees (refcountbt). The number of + * blocks reserved is based on the formula: * * num trees * ((2 blocks/level * max depth) - 1) * * Keep in mind that max depth is calculated separately for each type of tree. */ -static uint +uint xfs_allocfree_log_count( struct xfs_mount *mp, uint num_ops) @@ -83,6 +84,8 @@ xfs_allocfree_log_count( blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1); if (xfs_sb_version_hasrmapbt(&mp->m_sb)) blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1); + if (xfs_sb_version_hasreflink(&mp->m_sb)) + blocks += num_ops * (2 * mp->m_refc_maxlevels - 1); return blocks; } @@ -809,11 +812,18 @@ xfs_trans_resv_calc( * require a permanent reservation on space. */ resp->tr_write.tr_logres = xfs_calc_write_reservation(mp); - resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + if (xfs_sb_version_hasreflink(&mp->m_sb)) + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; + else + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp); - resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + if (xfs_sb_version_hasreflink(&mp->m_sb)) + resp->tr_itruncate.tr_logcount = + XFS_ITRUNCATE_LOG_COUNT_REFLINK; + else + resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp); @@ -870,7 +880,10 @@ xfs_trans_resv_calc( resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp); - resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; + if (xfs_sb_version_hasreflink(&mp->m_sb)) + resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; + else + resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; /* diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 0eb46ed6d404..b7e5357d060a 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -87,6 +87,7 @@ struct xfs_trans_resv { #define XFS_DEFAULT_LOG_COUNT 1 #define XFS_DEFAULT_PERM_LOG_COUNT 2 #define XFS_ITRUNCATE_LOG_COUNT 2 +#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8 #define XFS_INACTIVE_LOG_COUNT 2 #define XFS_CREATE_LOG_COUNT 2 #define XFS_CREATE_TMPFILE_LOG_COUNT 2 @@ -96,11 +97,13 @@ struct xfs_trans_resv { #define XFS_LINK_LOG_COUNT 2 #define XFS_RENAME_LOG_COUNT 2 #define XFS_WRITE_LOG_COUNT 2 +#define XFS_WRITE_LOG_COUNT_REFLINK 8 #define XFS_ADDAFORK_LOG_COUNT 2 #define XFS_ATTRINVAL_LOG_COUNT 1 #define XFS_ATTRSET_LOG_COUNT 3 #define XFS_ATTRRM_LOG_COUNT 3 void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); +uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops); #endif /* __XFS_TRANS_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 41e0428d8175..7917f6e44286 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -21,6 +21,8 @@ /* * Components of space reservations. 
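
As a worked example of the xfs_allocfree_log_count() formula above, take one operation (num_ops = 1) on a reflink filesystem with illustrative tree heights m_ag_maxlevels = 5, m_rmap_maxlevels = 9, and m_refc_maxlevels = 5 (these heights are assumptions for the arithmetic, not values taken from any particular filesystem):

    bnobt + cntbt:  1 * 2 * (2 * 5 - 1) = 18 blocks
    rmapbt:         1 * (2 * 9 - 1)     = 17 blocks
    refcountbt:     1 * (2 * 5 - 1)     =  9 blocks
                                  total = 44 blocks

So enabling rmap and reflink noticeably raises the per-extent reservation, which is also why the REFLINK log-count defines below jump from 2 to 8.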
 */
+#define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)	\
+		(((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0]))
 #define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)	\
 		(((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))
 #define	XFS_EXTENTADD_SPACE_RES(mp,w)	(XFS_BM_MAXLEVELS(mp,w) - 1)
@@ -28,6 +30,13 @@
 	(((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
 	  XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
 	  XFS_EXTENTADD_SPACE_RES(mp,w))
+#define XFS_SWAP_RMAP_SPACE_RES(mp,b,w)\
+	(((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
+	  XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
+	  XFS_EXTENTADD_SPACE_RES(mp,w) + \
+	 ((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \
+	  XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \
+	  (mp)->m_rmap_maxlevels)
 #define	XFS_DAENTER_1B(mp,w)	\
 	((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1)
 #define	XFS_DAENTER_DBS(mp,w)	\
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 3d503647f26b..8d74870468c2 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -90,6 +90,7 @@ typedef __int64_t	xfs_sfiloff_t;	/* signed block number in a file */
 */
 #define	XFS_DATA_FORK	0
 #define	XFS_ATTR_FORK	1
+#define	XFS_COW_FORK	2
 
 /*
 * Min numbers of data/attr fork btree root pointers.
@@ -109,7 +110,7 @@ typedef enum {
 typedef enum {
 	XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi,
-	XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_MAX
+	XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX
 } xfs_btnum_t;
 
 struct xfs_name {
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 4a28fa91e3b1..3e57a56cf829 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -31,6 +31,7 @@
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
 #include "xfs_bmap_btree.h"
+#include "xfs_reflink.h"
 #include <linux/gfp.h>
 #include <linux/mpage.h>
 #include <linux/pagevec.h>
@@ -39,6 +40,7 @@
 /* flags for direct write completions */
 #define XFS_DIO_FLAG_UNWRITTEN	(1 << 0)
 #define XFS_DIO_FLAG_APPEND	(1 << 1)
+#define XFS_DIO_FLAG_COW	(1 << 2)
 
 /*
 * structure owned by writepages passed to individual writepage calls
@@ -287,6 +289,25 @@ xfs_end_io(
 		error = -EIO;
 
 	/*
+	 * For a CoW extent, we need to move the mapping from the CoW fork
+	 * to the data fork. If instead an error happened, just dump the
+	 * new blocks.
+	 */
+	if (ioend->io_type == XFS_IO_COW) {
+		if (error)
+			goto done;
+		if (ioend->io_bio->bi_error) {
+			error = xfs_reflink_cancel_cow_range(ip,
+					ioend->io_offset, ioend->io_size);
+			goto done;
+		}
+		error = xfs_reflink_end_cow(ip, ioend->io_offset,
+				ioend->io_size);
+		if (error)
+			goto done;
+	}
+
+	/*
 	 * For unwritten extents we need to issue transactions to convert a
 	 * range to normal written extents after the data I/O has finished.
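
Looking back at the XFS_SWAP_RMAP_SPACE_RES macro added to xfs_trans_space.h above: it is simply the sum of a bmbt-growth term and an rmapbt-growth term. A hedged restatement as a plain function, with the macro's mount-derived inputs spelled out as parameters (this helper is illustrative, not kernel code):

    /*
     * Illustrative restatement of XFS_SWAP_RMAP_SPACE_RES: worst-case
     * blocks needed to remap "b" extents, covering both bmbt growth
     * and rmapbt growth.
     */
    static unsigned int
    swap_rmap_space_res(unsigned int b,
                        unsigned int contig_extents_per_block, /* alloc_mxr[0] - alloc_mnr[0] */
                        unsigned int extentadd_res,            /* XFS_BM_MAXLEVELS - 1 */
                        unsigned int contig_rmaps_per_block,   /* rmap_mxr[0] - rmap_mnr[0] */
                        unsigned int rmap_maxlevels)
    {
            /* ceil(b / per-block capacity) blocks may split ... */
            unsigned int bmbt_blocks =
                    ((b + contig_extents_per_block - 1) / contig_extents_per_block) *
                    extentadd_res;
            /* ... and each split can cost a full path of new btree blocks. */
            unsigned int rmapbt_blocks =
                    ((b + contig_rmaps_per_block - 1) / contig_rmaps_per_block) *
                    rmap_maxlevels;

            return bmbt_blocks + rmapbt_blocks;
    }

Each term rounds the extent count up to the number of leaf blocks that might need to grow, then multiplies by the per-insertion worst case for that tree.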
* Detecting and handling completion IO errors is done individually @@ -301,7 +322,8 @@ xfs_end_io( } else if (ioend->io_append_trans) { error = xfs_setfilesize_ioend(ioend, error); } else { - ASSERT(!xfs_ioend_is_append(ioend)); + ASSERT(!xfs_ioend_is_append(ioend) || + ioend->io_type == XFS_IO_COW); } done: @@ -315,7 +337,7 @@ xfs_end_bio( struct xfs_ioend *ioend = bio->bi_private; struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - if (ioend->io_type == XFS_IO_UNWRITTEN) + if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW) queue_work(mp->m_unwritten_workqueue, &ioend->io_work); else if (ioend->io_append_trans) queue_work(mp->m_data_workqueue, &ioend->io_work); @@ -341,6 +363,7 @@ xfs_map_blocks( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; + ASSERT(type != XFS_IO_COW); if (type == XFS_IO_UNWRITTEN) bmapi_flags |= XFS_BMAPI_IGSTATE; @@ -355,6 +378,13 @@ xfs_map_blocks( offset_fsb = XFS_B_TO_FSBT(mp, offset); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, imap, &nimaps, bmapi_flags); + /* + * Truncate an overwrite extent if there's a pending CoW + * reservation before the end of this extent. This forces us + * to come back to writepage to take care of the CoW. + */ + if (nimaps && type == XFS_IO_OVERWRITE) + xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, imap); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (error) @@ -362,7 +392,8 @@ xfs_map_blocks( if (type == XFS_IO_DELALLOC && (!nimaps || isnullstartblock(imap->br_startblock))) { - error = xfs_iomap_write_allocate(ip, offset, imap); + error = xfs_iomap_write_allocate(ip, XFS_DATA_FORK, offset, + imap); if (!error) trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); return error; @@ -737,6 +768,56 @@ out_invalidate: return; } +static int +xfs_map_cow( + struct xfs_writepage_ctx *wpc, + struct inode *inode, + loff_t offset, + unsigned int *new_type) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_bmbt_irec imap; + bool is_cow = false, need_alloc = false; + int error; + + /* + * If we already have a valid COW mapping keep using it. + */ + if (wpc->io_type == XFS_IO_COW) { + wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset); + if (wpc->imap_valid) { + *new_type = XFS_IO_COW; + return 0; + } + } + + /* + * Else we need to check if there is a COW mapping at this offset. + */ + xfs_ilock(ip, XFS_ILOCK_SHARED); + is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, &need_alloc); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!is_cow) + return 0; + + /* + * And if the COW mapping has a delayed extent here we need to + * allocate real space for it now. 
+ */ + if (need_alloc) { + error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset, + &imap); + if (error) + return error; + } + + wpc->io_type = *new_type = XFS_IO_COW; + wpc->imap_valid = true; + wpc->imap = imap; + return 0; +} + /* * We implement an immediate ioend submission policy here to avoid needing to * chain multiple ioends and hence nest mempool allocations which can violate @@ -769,6 +850,7 @@ xfs_writepage_map( int error = 0; int count = 0; int uptodate = 1; + unsigned int new_type; bh = head = page_buffers(page); offset = page_offset(page); @@ -789,22 +871,13 @@ xfs_writepage_map( continue; } - if (buffer_unwritten(bh)) { - if (wpc->io_type != XFS_IO_UNWRITTEN) { - wpc->io_type = XFS_IO_UNWRITTEN; - wpc->imap_valid = false; - } - } else if (buffer_delay(bh)) { - if (wpc->io_type != XFS_IO_DELALLOC) { - wpc->io_type = XFS_IO_DELALLOC; - wpc->imap_valid = false; - } - } else if (buffer_uptodate(bh)) { - if (wpc->io_type != XFS_IO_OVERWRITE) { - wpc->io_type = XFS_IO_OVERWRITE; - wpc->imap_valid = false; - } - } else { + if (buffer_unwritten(bh)) + new_type = XFS_IO_UNWRITTEN; + else if (buffer_delay(bh)) + new_type = XFS_IO_DELALLOC; + else if (buffer_uptodate(bh)) + new_type = XFS_IO_OVERWRITE; + else { if (PageUptodate(page)) ASSERT(buffer_mapped(bh)); /* @@ -817,6 +890,17 @@ xfs_writepage_map( continue; } + if (xfs_is_reflink_inode(XFS_I(inode))) { + error = xfs_map_cow(wpc, inode, offset, &new_type); + if (error) + goto out; + } + + if (wpc->io_type != new_type) { + wpc->io_type = new_type; + wpc->imap_valid = false; + } + if (wpc->imap_valid) wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset); @@ -1107,18 +1191,24 @@ xfs_map_direct( struct inode *inode, struct buffer_head *bh_result, struct xfs_bmbt_irec *imap, - xfs_off_t offset) + xfs_off_t offset, + bool is_cow) { uintptr_t *flags = (uintptr_t *)&bh_result->b_private; xfs_off_t size = bh_result->b_size; trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size, - ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap); + ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW : + XFS_IO_OVERWRITE, imap); if (ISUNWRITTEN(imap)) { *flags |= XFS_DIO_FLAG_UNWRITTEN; set_buffer_defer_completion(bh_result); - } else if (offset + size > i_size_read(inode) || offset + size < 0) { + } else if (is_cow) { + *flags |= XFS_DIO_FLAG_COW; + set_buffer_defer_completion(bh_result); + } + if (offset + size > i_size_read(inode) || offset + size < 0) { *flags |= XFS_DIO_FLAG_APPEND; set_buffer_defer_completion(bh_result); } @@ -1164,6 +1254,44 @@ xfs_map_trim_size( bh_result->b_size = mapping_size; } +/* Bounce unaligned directio writes to the page cache. */ +static int +xfs_bounce_unaligned_dio_write( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + struct xfs_bmbt_irec *imap) +{ + struct xfs_bmbt_irec irec; + xfs_fileoff_t delta; + bool shared; + bool x; + int error; + + irec = *imap; + if (offset_fsb > irec.br_startoff) { + delta = offset_fsb - irec.br_startoff; + irec.br_blockcount -= delta; + irec.br_startblock += delta; + irec.br_startoff = offset_fsb; + } + error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x); + if (error) + return error; + + /* + * We're here because we're trying to do a directio write to a + * region that isn't aligned to a filesystem block. If any part + * of the extent is shared, fall back to buffered mode to handle + * the RMW. This is done by returning -EREMCHG ("remote addr + * changed"), which is caught further up the call stack. 
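
For readers skimming the hunk above: the front-trim at the top of xfs_bounce_unaligned_dio_write() can be read in isolation. A self-contained restatement, with a simplified struct standing in for struct xfs_bmbt_irec:

    /*
     * Standalone restatement of the trim step: clip a mapping so it
     * starts at offset_fsb before asking whether any of it is shared.
     * "struct irec" is a stand-in type, not the kernel's.
     */
    struct irec { unsigned long long startoff, startblock, blockcount; };

    static void trim_front(struct irec *irec, unsigned long long offset_fsb)
    {
            if (offset_fsb > irec->startoff) {
                    unsigned long long delta = offset_fsb - irec->startoff;

                    irec->blockcount -= delta;  /* shorten the mapping... */
                    irec->startblock += delta;  /* ...and slide both the */
                    irec->startoff = offset_fsb; /* physical and logical start */
            }
    }

Only the tail of the mapping from the write's starting block onward matters for the shared-block probe, so the head is discarded first.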
+ */
+	if (shared) {
+		trace_xfs_reflink_bounce_dio_write(ip, imap);
+		return -EREMCHG;
+	}
+	return 0;
+}
+
 STATIC int
 __xfs_get_blocks(
 	struct inode		*inode,
@@ -1183,6 +1311,8 @@ __xfs_get_blocks(
 	xfs_off_t		offset;
 	ssize_t			size;
 	int			new = 0;
+	bool			is_cow = false;
+	bool			need_alloc = false;
 
 	BUG_ON(create && !direct);
 
@@ -1208,8 +1338,26 @@ __xfs_get_blocks(
 	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 
-	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
-				&imap, &nimaps, XFS_BMAPI_ENTIRE);
+	if (create && direct && xfs_is_reflink_inode(ip))
+		is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap,
+					&need_alloc);
+	if (!is_cow) {
+		error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+					&imap, &nimaps, XFS_BMAPI_ENTIRE);
+		/*
+		 * Truncate an overwrite extent if there's a pending CoW
+		 * reservation before the end of this extent. This
+		 * forces us to come back to get_blocks to take care of
+		 * the CoW.
+		 */
+		if (create && direct && nimaps &&
+		    imap.br_startblock != HOLESTARTBLOCK &&
+		    imap.br_startblock != DELAYSTARTBLOCK &&
+		    !ISUNWRITTEN(&imap))
+			xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb,
+					&imap);
+	}
+	ASSERT(!need_alloc);
 	if (error)
 		goto out_unlock;
 
@@ -1261,6 +1409,13 @@ __xfs_get_blocks(
 	if (imap.br_startblock != HOLESTARTBLOCK &&
 	    imap.br_startblock != DELAYSTARTBLOCK &&
 	    (create || !ISUNWRITTEN(&imap))) {
+		if (create && direct && !is_cow) {
+			error = xfs_bounce_unaligned_dio_write(ip, offset_fsb,
+					&imap);
+			if (error)
+				return error;
+		}
+
 		xfs_map_buffer(inode, bh_result, &imap, offset);
 		if (ISUNWRITTEN(&imap))
 			set_buffer_unwritten(bh_result);
@@ -1269,7 +1424,8 @@ __xfs_get_blocks(
 		if (dax_fault)
 			ASSERT(!ISUNWRITTEN(&imap));
 		else
-			xfs_map_direct(inode, bh_result, &imap, offset);
+			xfs_map_direct(inode, bh_result, &imap, offset,
+					is_cow);
 	}
 }
 
@@ -1391,11 +1547,14 @@ xfs_end_io_direct_write(
 		i_size_write(inode, offset + size);
 	spin_unlock(&ip->i_flags_lock);
 
+	if (flags & XFS_DIO_FLAG_COW)
+		error = xfs_reflink_end_cow(ip, offset, size);
 	if (flags & XFS_DIO_FLAG_UNWRITTEN) {
 		trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
 
 		error = xfs_iomap_write_unwritten(ip, offset, size);
-	} else if (flags & XFS_DIO_FLAG_APPEND) {
+	}
+	if (flags & XFS_DIO_FLAG_APPEND) {
 		trace_xfs_end_io_direct_write_append(ip, offset, size);
 
 		error = xfs_setfilesize(ip, offset, size);
@@ -1425,6 +1584,17 @@ xfs_vm_bmap(
 
 	trace_xfs_vm_bmap(XFS_I(inode));
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
+
+	/*
+	 * The swap code (ab-)uses ->bmap to get a block mapping and then
+	 * bypasses the file system for actual I/O. We really can't allow
+	 * that on reflink inodes, so we have to skip out here. And yes,
+	 * 0 is the magic code for a bmap error.
+ */ + if (xfs_is_reflink_inode(ip)) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return 0; + } filemap_write_and_wait(mapping); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return generic_block_bmap(mapping, block, xfs_get_blocks); diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 1950e3bca2ac..b3c6634f9518 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -28,13 +28,15 @@ enum { XFS_IO_DELALLOC, /* covers delalloc region */ XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ XFS_IO_OVERWRITE, /* covers already allocated extent */ + XFS_IO_COW, /* covers copy-on-write extent */ }; #define XFS_IO_TYPES \ { XFS_IO_INVALID, "invalid" }, \ { XFS_IO_DELALLOC, "delalloc" }, \ { XFS_IO_UNWRITTEN, "unwritten" }, \ - { XFS_IO_OVERWRITE, "overwrite" } + { XFS_IO_OVERWRITE, "overwrite" }, \ + { XFS_IO_COW, "CoW" } /* * Structure for buffered I/O completions. diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c new file mode 100644 index 000000000000..9bf57c76623b --- /dev/null +++ b/fs/xfs/xfs_bmap_item.c @@ -0,0 +1,508 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_buf_item.h" +#include "xfs_bmap_item.h" +#include "xfs_log.h" +#include "xfs_bmap.h" +#include "xfs_icache.h" +#include "xfs_trace.h" + + +kmem_zone_t *xfs_bui_zone; +kmem_zone_t *xfs_bud_zone; + +static inline struct xfs_bui_log_item *BUI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_bui_log_item, bui_item); +} + +void +xfs_bui_item_free( + struct xfs_bui_log_item *buip) +{ + kmem_zone_free(xfs_bui_zone, buip); +} + +STATIC void +xfs_bui_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + struct xfs_bui_log_item *buip = BUI_ITEM(lip); + + *nvecs += 1; + *nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given bui log item. We use only 1 iovec, and we point that + * at the bui_log_format structure embedded in the bui item. + * It is at this point that we assert that all of the extent + * slots in the bui item have been filled. 
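
The size calculation in xfs_bui_item_size() leans on xfs_bui_log_format_sizeof(), which lives in the shared log-format headers rather than in this file. Assuming the usual intent-item layout (a fixed header followed by a trailing extent array), it plausibly looks like the sketch below; the stand-in struct layouts here are invented for illustration and the real definitions differ in detail:

    #include <stddef.h>

    /* Minimal stand-ins so the sketch is self-contained. */
    struct map_extent_sketch {
            unsigned long long owner, startblock, startoff, len;
            unsigned int flags;
    };
    struct bui_log_format_sketch {
            unsigned short type, size;
            unsigned int nextents;
            unsigned long long id;
            struct map_extent_sketch extents[]; /* flexible trailing array */
    };

    /* Header bytes plus nr extent records, mirroring the offsetof-based
     * xfs_bui_log_item_sizeof() helper shown in xfs_bmap_item.h below. */
    static inline size_t bui_log_format_sizeof(unsigned int nr)
    {
            return offsetof(struct bui_log_format_sketch, extents) +
                   nr * sizeof(struct map_extent_sketch);
    }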
+ */ +STATIC void +xfs_bui_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_bui_log_item *buip = BUI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + ASSERT(atomic_read(&buip->bui_next_extent) == + buip->bui_format.bui_nextents); + + buip->bui_format.bui_type = XFS_LI_BUI; + buip->bui_format.bui_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_BUI_FORMAT, &buip->bui_format, + xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents)); +} + +/* + * Pinning has no meaning for an bui item, so just return. + */ +STATIC void +xfs_bui_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * The unpin operation is the last place an BUI is manipulated in the log. It is + * either inserted in the AIL or aborted in the event of a log I/O error. In + * either case, the BUI transaction has been successfully committed to make it + * this far. Therefore, we expect whoever committed the BUI to either construct + * and commit the BUD or drop the BUD's reference in the event of error. Simply + * drop the log's BUI reference now that the log is done with it. + */ +STATIC void +xfs_bui_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_bui_log_item *buip = BUI_ITEM(lip); + + xfs_bui_release(buip); +} + +/* + * BUI items have no locking or pushing. However, since BUIs are pulled from + * the AIL when their corresponding BUDs are committed to disk, their situation + * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller + * will eventually flush the log. This should help in getting the BUI out of + * the AIL. + */ +STATIC uint +xfs_bui_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +/* + * The BUI has been either committed or aborted if the transaction has been + * cancelled. If the transaction was cancelled, an BUD isn't going to be + * constructed and thus we free the BUI here directly. + */ +STATIC void +xfs_bui_item_unlock( + struct xfs_log_item *lip) +{ + if (lip->li_flags & XFS_LI_ABORTED) + xfs_bui_item_free(BUI_ITEM(lip)); +} + +/* + * The BUI is logged only once and cannot be moved in the log, so simply return + * the lsn at which it's been logged. + */ +STATIC xfs_lsn_t +xfs_bui_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + return lsn; +} + +/* + * The BUI dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_bui_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all bui log items. + */ +static const struct xfs_item_ops xfs_bui_item_ops = { + .iop_size = xfs_bui_item_size, + .iop_format = xfs_bui_item_format, + .iop_pin = xfs_bui_item_pin, + .iop_unpin = xfs_bui_item_unpin, + .iop_unlock = xfs_bui_item_unlock, + .iop_committed = xfs_bui_item_committed, + .iop_push = xfs_bui_item_push, + .iop_committing = xfs_bui_item_committing, +}; + +/* + * Allocate and initialize an bui item with the given number of extents. 
+ */ +struct xfs_bui_log_item * +xfs_bui_init( + struct xfs_mount *mp) + +{ + struct xfs_bui_log_item *buip; + + buip = kmem_zone_zalloc(xfs_bui_zone, KM_SLEEP); + + xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); + buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; + buip->bui_format.bui_id = (uintptr_t)(void *)buip; + atomic_set(&buip->bui_next_extent, 0); + atomic_set(&buip->bui_refcount, 2); + + return buip; +} + +/* + * Freeing the BUI requires that we remove it from the AIL if it has already + * been placed there. However, the BUI may not yet have been placed in the AIL + * when called by xfs_bui_release() from BUD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the BUI. + */ +void +xfs_bui_release( + struct xfs_bui_log_item *buip) +{ + if (atomic_dec_and_test(&buip->bui_refcount)) { + xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_bui_item_free(buip); + } +} + +static inline struct xfs_bud_log_item *BUD_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_bud_log_item, bud_item); +} + +STATIC void +xfs_bud_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_bud_log_format); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given bud log item. We use only 1 iovec, and we point that + * at the bud_log_format structure embedded in the bud item. + * It is at this point that we assert that all of the extent + * slots in the bud item have been filled. + */ +STATIC void +xfs_bud_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_bud_log_item *budp = BUD_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + budp->bud_format.bud_type = XFS_LI_BUD; + budp->bud_format.bud_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_BUD_FORMAT, &budp->bud_format, + sizeof(struct xfs_bud_log_format)); +} + +/* + * Pinning has no meaning for an bud item, so just return. + */ +STATIC void +xfs_bud_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * Since pinning has no meaning for an bud item, unpinning does + * not either. + */ +STATIC void +xfs_bud_item_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +/* + * There isn't much you can do to push on an bud item. It is simply stuck + * waiting for the log to be flushed to disk. + */ +STATIC uint +xfs_bud_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +/* + * The BUD is either committed or aborted if the transaction is cancelled. If + * the transaction is cancelled, drop our reference to the BUI and free the + * BUD. + */ +STATIC void +xfs_bud_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_bud_log_item *budp = BUD_ITEM(lip); + + if (lip->li_flags & XFS_LI_ABORTED) { + xfs_bui_release(budp->bud_buip); + kmem_zone_free(xfs_bud_zone, budp); + } +} + +/* + * When the bud item is committed to disk, all we need to do is delete our + * reference to our partner bui item and then free ourselves. Since we're + * freeing ourselves we must return -1 to keep the transaction code from + * further referencing this item. + */ +STATIC xfs_lsn_t +xfs_bud_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_bud_log_item *budp = BUD_ITEM(lip); + + /* + * Drop the BUI reference regardless of whether the BUD has been + * aborted. 
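
Taken together, xfs_bui_init() (which sets bui_refcount to 2) and xfs_bui_release() encode exactly two expected reference drops: one from log unpin and one from BUD processing, with the last drop freeing the item. A toy userspace model of that protocol, purely illustrative:

    #include <stdatomic.h>
    #include <stdio.h>

    /* Toy model of the BUI two-reference protocol (illustrative only). */
    static atomic_int refcount;

    static void release(const char *who)
    {
            /* atomic_fetch_sub returns the old value: 1 means final drop. */
            if (atomic_fetch_sub(&refcount, 1) == 1)
                    printf("%s performed the final drop: free the BUI\n", who);
    }

    int main(void)
    {
            atomic_init(&refcount, 2); /* as in xfs_bui_init() */
            release("log unpin");      /* cf. xfs_bui_item_unpin() */
            release("BUD commit");     /* cf. xfs_bud_item_committed() */
            return 0;
    }

The ordering of the two drops is not fixed, which is exactly why a count is needed instead of a flag.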
Once the BUD transaction is constructed, it is the sole + * responsibility of the BUD to release the BUI (even if the BUI is + * aborted due to log I/O error). + */ + xfs_bui_release(budp->bud_buip); + kmem_zone_free(xfs_bud_zone, budp); + + return (xfs_lsn_t)-1; +} + +/* + * The BUD dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_bud_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all bud log items. + */ +static const struct xfs_item_ops xfs_bud_item_ops = { + .iop_size = xfs_bud_item_size, + .iop_format = xfs_bud_item_format, + .iop_pin = xfs_bud_item_pin, + .iop_unpin = xfs_bud_item_unpin, + .iop_unlock = xfs_bud_item_unlock, + .iop_committed = xfs_bud_item_committed, + .iop_push = xfs_bud_item_push, + .iop_committing = xfs_bud_item_committing, +}; + +/* + * Allocate and initialize an bud item with the given number of extents. + */ +struct xfs_bud_log_item * +xfs_bud_init( + struct xfs_mount *mp, + struct xfs_bui_log_item *buip) + +{ + struct xfs_bud_log_item *budp; + + budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP); + xfs_log_item_init(mp, &budp->bud_item, XFS_LI_BUD, &xfs_bud_item_ops); + budp->bud_buip = buip; + budp->bud_format.bud_bui_id = buip->bui_format.bui_id; + + return budp; +} + +/* + * Process a bmap update intent item that was recovered from the log. + * We need to update some inode's bmbt. + */ +int +xfs_bui_recover( + struct xfs_mount *mp, + struct xfs_bui_log_item *buip) +{ + int error = 0; + unsigned int bui_type; + struct xfs_map_extent *bmap; + xfs_fsblock_t startblock_fsb; + xfs_fsblock_t inode_fsb; + bool op_ok; + struct xfs_bud_log_item *budp; + enum xfs_bmap_intent_type type; + int whichfork; + xfs_exntst_t state; + struct xfs_trans *tp; + struct xfs_inode *ip = NULL; + struct xfs_defer_ops dfops; + xfs_fsblock_t firstfsb; + + ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); + + /* Only one mapping operation per BUI... */ + if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { + set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); + xfs_bui_release(buip); + return -EIO; + } + + /* + * First check the validity of the extent described by the + * BUI. If anything is bad, then toss the BUI. + */ + bmap = &buip->bui_format.bui_extents[0]; + startblock_fsb = XFS_BB_TO_FSB(mp, + XFS_FSB_TO_DADDR(mp, bmap->me_startblock)); + inode_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp, + XFS_INO_TO_FSB(mp, bmap->me_owner))); + switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { + case XFS_BMAP_MAP: + case XFS_BMAP_UNMAP: + op_ok = true; + break; + default: + op_ok = false; + break; + } + if (!op_ok || startblock_fsb == 0 || + bmap->me_len == 0 || + inode_fsb == 0 || + startblock_fsb >= mp->m_sb.sb_dblocks || + bmap->me_len >= mp->m_sb.sb_agblocks || + inode_fsb >= mp->m_sb.sb_dblocks || + (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS)) { + /* + * This will pull the BUI from the AIL and + * free the memory associated with it. + */ + set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); + xfs_bui_release(buip); + return -EIO; + } + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + if (error) + return error; + budp = xfs_trans_get_bud(tp, buip); + + /* Grab the inode. 
*/ + error = xfs_iget(mp, tp, bmap->me_owner, 0, XFS_ILOCK_EXCL, &ip); + if (error) + goto err_inode; + + if (VFS_I(ip)->i_nlink == 0) + xfs_iflags_set(ip, XFS_IRECOVERY); + xfs_defer_init(&dfops, &firstfsb); + + /* Process deferred bmap item. */ + state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? + XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; + switch (bui_type) { + case XFS_BMAP_MAP: + case XFS_BMAP_UNMAP: + type = bui_type; + break; + default: + error = -EFSCORRUPTED; + goto err_dfops; + } + xfs_trans_ijoin(tp, ip, 0); + + error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type, + ip, whichfork, bmap->me_startoff, + bmap->me_startblock, bmap->me_len, + state); + if (error) + goto err_dfops; + + /* Finish transaction, free inodes. */ + error = xfs_defer_finish(&tp, &dfops, NULL); + if (error) + goto err_dfops; + + set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); + + return error; + +err_dfops: + xfs_defer_cancel(&dfops); +err_inode: + xfs_trans_cancel(tp); + if (ip) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); + } + return error; +} diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h new file mode 100644 index 000000000000..c867daae4a3c --- /dev/null +++ b/fs/xfs/xfs_bmap_item.h @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_BMAP_ITEM_H__ +#define __XFS_BMAP_ITEM_H__ + +/* + * There are (currently) two pairs of bmap btree redo item types: map & unmap. + * The common abbreviations for these are BUI (bmap update intent) and BUD + * (bmap update done). The redo item type is encoded in the flags field of + * each xfs_map_extent. + * + * *I items should be recorded in the *first* of a series of rolled + * transactions, and the *D items should be recorded in the same transaction + * that records the associated bmbt updates. + * + * Should the system crash after the commit of the first transaction but + * before the commit of the final transaction in a series, log recovery will + * use the redo information recorded by the intent items to replay the + * bmbt metadata updates in the non-first transaction. + */ + +/* kernel only BUI/BUD definitions */ + +struct xfs_mount; +struct kmem_zone; + +/* + * Max number of extents in fast allocation path. + */ +#define XFS_BUI_MAX_FAST_EXTENTS 1 + +/* + * Define BUI flag bits. Manipulated by set/clear/test_bit operators. + */ +#define XFS_BUI_RECOVERED 1 + +/* + * This is the "bmap update intent" log item. It is used to log the fact that + * some reverse mappings need to change. 
It is used in conjunction with the + * "bmap update done" log item described below. + * + * These log items follow the same rules as struct xfs_efi_log_item; see the + * comments about that structure (in xfs_extfree_item.h) for more details. + */ +struct xfs_bui_log_item { + struct xfs_log_item bui_item; + atomic_t bui_refcount; + atomic_t bui_next_extent; + unsigned long bui_flags; /* misc flags */ + struct xfs_bui_log_format bui_format; +}; + +static inline size_t +xfs_bui_log_item_sizeof( + unsigned int nr) +{ + return offsetof(struct xfs_bui_log_item, bui_format) + + xfs_bui_log_format_sizeof(nr); +} + +/* + * This is the "bmap update done" log item. It is used to log the fact that + * some bmbt updates mentioned in an earlier bui item have been performed. + */ +struct xfs_bud_log_item { + struct xfs_log_item bud_item; + struct xfs_bui_log_item *bud_buip; + struct xfs_bud_log_format bud_format; +}; + +extern struct kmem_zone *xfs_bui_zone; +extern struct kmem_zone *xfs_bud_zone; + +struct xfs_bui_log_item *xfs_bui_init(struct xfs_mount *); +struct xfs_bud_log_item *xfs_bud_init(struct xfs_mount *, + struct xfs_bui_log_item *); +void xfs_bui_item_free(struct xfs_bui_log_item *); +void xfs_bui_release(struct xfs_bui_log_item *); +int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip); + +#endif /* __XFS_BMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index e827d657c314..552465e011ec 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -42,6 +42,9 @@ #include "xfs_icache.h" #include "xfs_log.h" #include "xfs_rmap_btree.h" +#include "xfs_iomap.h" +#include "xfs_reflink.h" +#include "xfs_refcount.h" /* Kernel only BMAP related definitions and functions */ @@ -389,11 +392,13 @@ xfs_bmap_count_blocks( STATIC int xfs_getbmapx_fix_eof_hole( xfs_inode_t *ip, /* xfs incore inode pointer */ + int whichfork, struct getbmapx *out, /* output structure */ int prealloced, /* this is a file with * preallocated data space */ __int64_t end, /* last block requested */ - xfs_fsblock_t startblock) + xfs_fsblock_t startblock, + bool moretocome) { __int64_t fixlen; xfs_mount_t *mp; /* file system mount point */ @@ -418,8 +423,9 @@ xfs_getbmapx_fix_eof_hole( else out->bmv_block = xfs_fsb_to_db(ip, startblock); fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset); - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!moretocome && + xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1)) out->bmv_oflags |= BMV_OF_LAST; } @@ -427,6 +433,81 @@ xfs_getbmapx_fix_eof_hole( return 1; } +/* Adjust the reported bmap around shared/unshared extent transitions. */ +STATIC int +xfs_getbmap_adjust_shared( + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *map, + struct getbmapx *out, + struct xfs_bmbt_irec *next_map) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_agblock_t ebno; + xfs_extlen_t elen; + xfs_extlen_t nlen; + int error; + + next_map->br_startblock = NULLFSBLOCK; + next_map->br_startoff = NULLFILEOFF; + next_map->br_blockcount = 0; + + /* Only written data blocks can be shared. 
*/ + if (!xfs_is_reflink_inode(ip) || whichfork != XFS_DATA_FORK || + map->br_startblock == DELAYSTARTBLOCK || + map->br_startblock == HOLESTARTBLOCK || + ISUNWRITTEN(map)) + return 0; + + agno = XFS_FSB_TO_AGNO(mp, map->br_startblock); + agbno = XFS_FSB_TO_AGBNO(mp, map->br_startblock); + error = xfs_reflink_find_shared(mp, agno, agbno, map->br_blockcount, + &ebno, &elen, true); + if (error) + return error; + + if (ebno == NULLAGBLOCK) { + /* No shared blocks at all. */ + return 0; + } else if (agbno == ebno) { + /* + * Shared extent at (agbno, elen). Shrink the reported + * extent length and prepare to move the start of map[i] + * to agbno+elen, with the aim of (re)formatting the new + * map[i] the next time through the inner loop. + */ + out->bmv_length = XFS_FSB_TO_BB(mp, elen); + out->bmv_oflags |= BMV_OF_SHARED; + if (elen != map->br_blockcount) { + *next_map = *map; + next_map->br_startblock += elen; + next_map->br_startoff += elen; + next_map->br_blockcount -= elen; + } + map->br_blockcount -= elen; + } else { + /* + * There's an unshared extent (agbno, ebno - agbno) + * followed by shared extent at (ebno, elen). Shrink + * the reported extent length to cover only the unshared + * extent and prepare to move up the start of map[i] to + * ebno, with the aim of (re)formatting the new map[i] + * the next time through the inner loop. + */ + *next_map = *map; + nlen = ebno - agbno; + out->bmv_length = XFS_FSB_TO_BB(mp, nlen); + next_map->br_startblock += nlen; + next_map->br_startoff += nlen; + next_map->br_blockcount -= nlen; + map->br_blockcount -= nlen; + } + + return 0; +} + /* * Get inode's extents as described in bmv, and format for output. * Calls formatter to fill the user's buffer until all extents @@ -459,12 +540,28 @@ xfs_getbmap( int iflags; /* interface flags */ int bmapi_flags; /* flags for xfs_bmapi */ int cur_ext = 0; + struct xfs_bmbt_irec inject_map; mp = ip->i_mount; iflags = bmv->bmv_iflags; - whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; - if (whichfork == XFS_ATTR_FORK) { +#ifndef DEBUG + /* Only allow CoW fork queries if we're debugging. 
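
The splitting performed by xfs_getbmap_adjust_shared() above boils down to a three-way case analysis on where the first shared run (ebno, elen) falls within the mapping. A simplified standalone sketch (stand-in types; note the kernel signals "no shared blocks" with NULLAGBLOCK, whereas this model uses a zero length):

    /* Simplified model of the three cases; types are stand-ins. */
    struct span { unsigned long long start, len; };

    /* Returns the length to report now; fills *next with any remainder
     * to be reprocessed on the next pass through the loop. */
    static unsigned long long
    split_at_shared(struct span map, unsigned long long ebno,
                    unsigned long long elen, int *shared, struct span *next)
    {
            next->len = 0;
            if (elen == 0) {                 /* no shared blocks at all */
                    *shared = 0;
                    return map.len;
            }
            if (ebno == map.start) {         /* shared prefix of length elen */
                    *shared = 1;
                    next->start = map.start + elen;
                    next->len = map.len - elen;
                    return elen;
            }
            /* unshared prefix; the shared run begins at ebno */
            *shared = 0;
            next->start = ebno;
            next->len = map.len - (ebno - map.start);
            return ebno - map.start;
    }

This is why the caller stashes the remainder in inject_map and re-runs the inner loop on it: each pass reports one homogeneous (all-shared or all-unshared) piece.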
*/ + if (iflags & BMV_IF_COWFORK) + return -EINVAL; +#endif + if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK)) + return -EINVAL; + + if (iflags & BMV_IF_ATTRFORK) + whichfork = XFS_ATTR_FORK; + else if (iflags & BMV_IF_COWFORK) + whichfork = XFS_COW_FORK; + else + whichfork = XFS_DATA_FORK; + + switch (whichfork) { + case XFS_ATTR_FORK: if (XFS_IFORK_Q(ip)) { if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE && @@ -480,7 +577,20 @@ xfs_getbmap( prealloced = 0; fixlen = 1LL << 32; - } else { + break; + case XFS_COW_FORK: + if (ip->i_cformat != XFS_DINODE_FMT_EXTENTS) + return -EINVAL; + + if (xfs_get_cowextsz_hint(ip)) { + prealloced = 1; + fixlen = mp->m_super->s_maxbytes; + } else { + prealloced = 0; + fixlen = XFS_ISIZE(ip); + } + break; + default: if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && ip->i_d.di_format != XFS_DINODE_FMT_BTREE && ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) @@ -494,6 +604,7 @@ xfs_getbmap( prealloced = 0; fixlen = XFS_ISIZE(ip); } + break; } if (bmv->bmv_length == -1) { @@ -520,7 +631,8 @@ xfs_getbmap( return -ENOMEM; xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (whichfork == XFS_DATA_FORK) { + switch (whichfork) { + case XFS_DATA_FORK: if (!(iflags & BMV_IF_DELALLOC) && (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) { error = filemap_write_and_wait(VFS_I(ip)->i_mapping); @@ -538,8 +650,14 @@ xfs_getbmap( } lock = xfs_ilock_data_map_shared(ip); - } else { + break; + case XFS_COW_FORK: + lock = XFS_ILOCK_SHARED; + xfs_ilock(ip, lock); + break; + case XFS_ATTR_FORK: lock = xfs_ilock_attr_map_shared(ip); + break; } /* @@ -581,7 +699,8 @@ xfs_getbmap( goto out_free_map; ASSERT(nmap <= subnex); - for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { + for (i = 0; i < nmap && nexleft && bmv->bmv_length && + cur_ext < bmv->bmv_count; i++) { out[cur_ext].bmv_oflags = 0; if (map[i].br_state == XFS_EXT_UNWRITTEN) out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC; @@ -614,9 +733,16 @@ xfs_getbmap( goto out_free_map; } - if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext], - prealloced, bmvend, - map[i].br_startblock)) + /* Is this a shared block? */ + error = xfs_getbmap_adjust_shared(ip, whichfork, + &map[i], &out[cur_ext], &inject_map); + if (error) + goto out_free_map; + + if (!xfs_getbmapx_fix_eof_hole(ip, whichfork, + &out[cur_ext], prealloced, bmvend, + map[i].br_startblock, + inject_map.br_startblock != NULLFSBLOCK)) goto out_free_map; bmv->bmv_offset = @@ -636,11 +762,16 @@ xfs_getbmap( continue; } - nexleft--; + if (inject_map.br_startblock != NULLFSBLOCK) { + map[i] = inject_map; + i--; + } else + nexleft--; bmv->bmv_entries++; cur_ext++; } - } while (nmap && nexleft && bmv->bmv_length); + } while (nmap && nexleft && bmv->bmv_length && + cur_ext < bmv->bmv_count); out_free_map: kmem_free(map); @@ -1433,8 +1564,8 @@ xfs_insert_file_space( */ static int xfs_swap_extents_check_format( - xfs_inode_t *ip, /* target inode */ - xfs_inode_t *tip) /* tmp inode */ + struct xfs_inode *ip, /* target inode */ + struct xfs_inode *tip) /* tmp inode */ { /* Should never get a local format */ @@ -1450,6 +1581,13 @@ xfs_swap_extents_check_format( return -EINVAL; /* + * If we have to use the (expensive) rmap swap method, we can + * handle any number of extents and any format. 
+ */
+	if (xfs_sb_version_hasrmapbt(&ip->i_mount->m_sb))
+		return 0;
+
+	/*
 	 * if the target inode is in extent form and the temp inode is in btree
 	 * form then we will end up with the target inode in the wrong format
 	 * as we already know there are less extents in the temp inode.
@@ -1518,125 +1656,161 @@ xfs_swap_extent_flush(
 	return 0;
 }
 
-int
-xfs_swap_extents(
-	xfs_inode_t	*ip,	/* target inode */
-	xfs_inode_t	*tip,	/* tmp inode */
-	xfs_swapext_t	*sxp)
+/*
+ * Move extents from one file to another, when rmap is enabled.
+ */
+STATIC int
+xfs_swap_extent_rmap(
+	struct xfs_trans		**tpp,
+	struct xfs_inode		*ip,
+	struct xfs_inode		*tip)
 {
-	xfs_mount_t	*mp = ip->i_mount;
-	xfs_trans_t	*tp;
-	xfs_bstat_t	*sbp = &sxp->sx_stat;
-	xfs_ifork_t	*tempifp, *ifp, *tifp;
-	int		src_log_flags, target_log_flags;
-	int		error = 0;
-	int		aforkblks = 0;
-	int		taforkblks = 0;
-	__uint64_t	tmp;
-	int		lock_flags;
-
-	/* XXX: we can't do this with rmap, will fix later */
-	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
-		return -EOPNOTSUPP;
-
-	tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
-	if (!tempifp) {
-		error = -ENOMEM;
-		goto out;
-	}
+	struct xfs_bmbt_irec	irec;
+	struct xfs_bmbt_irec	uirec;
+	struct xfs_bmbt_irec	tirec;
+	xfs_fileoff_t		offset_fsb;
+	xfs_fileoff_t		end_fsb;
+	xfs_filblks_t		count_fsb;
+	xfs_fsblock_t		firstfsb;
+	struct xfs_defer_ops	dfops;
+	int			error;
+	xfs_filblks_t		ilen;
+	xfs_filblks_t		rlen;
+	int			nimaps;
+	__uint64_t		tip_flags2;
 
 	/*
-	 * Lock the inodes against other IO, page faults and truncate to
-	 * begin with. Then we can ensure the inodes are flushed and have no
-	 * page cache safely. Once we have done this we can take the ilocks and
-	 * do the rest of the checks.
+	 * If the source file has shared blocks, we must flag the donor
+	 * file as having shared blocks so that we get the shared-block
+	 * rmap functions when we go to fix up the rmaps. The flags
+	 * will be switched for real later.
 	 */
-	lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
-	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
-	xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
-
-	/* Verify that both files have the same format */
-	if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
-		error = -EINVAL;
-		goto out_unlock;
-	}
+	tip_flags2 = tip->i_d.di_flags2;
+	if (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK)
+		tip->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
+
+	offset_fsb = 0;
+	end_fsb = XFS_B_TO_FSB(ip->i_mount, i_size_read(VFS_I(ip)));
+	count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb);
+
+	while (count_fsb) {
+		/* Read extent from the donor file */
+		nimaps = 1;
+		error = xfs_bmapi_read(tip, offset_fsb, count_fsb, &tirec,
+				&nimaps, 0);
+		if (error)
+			goto out;
+		ASSERT(nimaps == 1);
+		ASSERT(tirec.br_startblock != DELAYSTARTBLOCK);
+
+		trace_xfs_swap_extent_rmap_remap(tip, &tirec);
+		ilen = tirec.br_blockcount;
+
+		/* Unmap the old blocks in the source file. */
+		while (tirec.br_blockcount) {
+			xfs_defer_init(&dfops, &firstfsb);
+			trace_xfs_swap_extent_rmap_remap_piece(tip, &tirec);
+
+			/* Read extent from the source file */
+			nimaps = 1;
+			error = xfs_bmapi_read(ip, tirec.br_startoff,
+					tirec.br_blockcount, &irec,
+					&nimaps, 0);
+			if (error)
+				goto out_defer;
+			ASSERT(nimaps == 1);
+			ASSERT(tirec.br_startoff == irec.br_startoff);
+			trace_xfs_swap_extent_rmap_remap_piece(ip, &irec);
+
+			/* Trim the extent.
*/ + uirec = tirec; + uirec.br_blockcount = rlen = min_t(xfs_filblks_t, + tirec.br_blockcount, + irec.br_blockcount); + trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec); + + /* Remove the mapping from the donor file. */ + error = xfs_bmap_unmap_extent((*tpp)->t_mountp, &dfops, + tip, &uirec); + if (error) + goto out_defer; - /* Verify both files are either real-time or non-realtime */ - if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { - error = -EINVAL; - goto out_unlock; - } + /* Remove the mapping from the source file. */ + error = xfs_bmap_unmap_extent((*tpp)->t_mountp, &dfops, + ip, &irec); + if (error) + goto out_defer; - error = xfs_swap_extent_flush(ip); - if (error) - goto out_unlock; - error = xfs_swap_extent_flush(tip); - if (error) - goto out_unlock; + /* Map the donor file's blocks into the source file. */ + error = xfs_bmap_map_extent((*tpp)->t_mountp, &dfops, + ip, &uirec); + if (error) + goto out_defer; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); - if (error) - goto out_unlock; + /* Map the source file's blocks into the donor file. */ + error = xfs_bmap_map_extent((*tpp)->t_mountp, &dfops, + tip, &irec); + if (error) + goto out_defer; - /* - * Lock and join the inodes to the tansaction so that transaction commit - * or cancel will unlock the inodes from this point onwards. - */ - xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); - lock_flags |= XFS_ILOCK_EXCL; - xfs_trans_ijoin(tp, ip, lock_flags); - xfs_trans_ijoin(tp, tip, lock_flags); + error = xfs_defer_finish(tpp, &dfops, ip); + if (error) + goto out_defer; + tirec.br_startoff += rlen; + if (tirec.br_startblock != HOLESTARTBLOCK && + tirec.br_startblock != DELAYSTARTBLOCK) + tirec.br_startblock += rlen; + tirec.br_blockcount -= rlen; + } - /* Verify all data are being swapped */ - if (sxp->sx_offset != 0 || - sxp->sx_length != ip->i_d.di_size || - sxp->sx_length != tip->i_d.di_size) { - error = -EFAULT; - goto out_trans_cancel; + /* Roll on... */ + count_fsb -= ilen; + offset_fsb += ilen; } - trace_xfs_swap_extent_before(ip, 0); - trace_xfs_swap_extent_before(tip, 1); + tip->i_d.di_flags2 = tip_flags2; + return 0; - /* check inode formats now that data is flushed */ - error = xfs_swap_extents_check_format(ip, tip); - if (error) { - xfs_notice(mp, - "%s: inode 0x%llx format is incompatible for exchanging.", - __func__, ip->i_ino); - goto out_trans_cancel; - } +out_defer: + xfs_defer_cancel(&dfops); +out: + trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_); + tip->i_d.di_flags2 = tip_flags2; + return error; +} + +/* Swap the extents of two files by swapping data forks. */ +STATIC int +xfs_swap_extent_forks( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_inode *tip, + int *src_log_flags, + int *target_log_flags) +{ + struct xfs_ifork tempifp, *ifp, *tifp; + int aforkblks = 0; + int taforkblks = 0; + __uint64_t tmp; + int error; - /* - * Compare the current change & modify times with that - * passed in. If they differ, we abort this swap. - * This is the mechanism used to ensure the calling - * process that the file was not changed out from - * under it. 
- */ - if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || - (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || - (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || - (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { - error = -EBUSY; - goto out_trans_cancel; - } /* * Count the number of extended attribute blocks */ if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { - error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); + error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, + &aforkblks); if (error) - goto out_trans_cancel; + return error; } if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, - &taforkblks); + &taforkblks); if (error) - goto out_trans_cancel; + return error; } /* @@ -1645,31 +1819,23 @@ xfs_swap_extents( * buffers, and so the validation done on read will expect the owner * field to be correctly set. Once we change the owners, we can swap the * inode forks. - * - * Note the trickiness in setting the log flags - we set the owner log - * flag on the opposite inode (i.e. the inode we are setting the new - * owner to be) because once we swap the forks and log that, log - * recovery is going to see the fork as owned by the swapped inode, - * not the pre-swapped inodes. */ - src_log_flags = XFS_ILOG_CORE; - target_log_flags = XFS_ILOG_CORE; if (ip->i_d.di_version == 3 && ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - target_log_flags |= XFS_ILOG_DOWNER; + (*target_log_flags) |= XFS_ILOG_DOWNER; error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, tip->i_ino, NULL); if (error) - goto out_trans_cancel; + return error; } if (tip->i_d.di_version == 3 && tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - src_log_flags |= XFS_ILOG_DOWNER; + (*src_log_flags) |= XFS_ILOG_DOWNER; error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, ip->i_ino, NULL); if (error) - goto out_trans_cancel; + return error; } /* @@ -1677,9 +1843,9 @@ xfs_swap_extents( */ ifp = &ip->i_df; tifp = &tip->i_df; - *tempifp = *ifp; /* struct copy */ + tempifp = *ifp; /* struct copy */ *ifp = *tifp; /* struct copy */ - *tifp = *tempifp; /* struct copy */ + *tifp = tempifp; /* struct copy */ /* * Fix the on-disk inode values @@ -1719,12 +1885,12 @@ xfs_swap_extents( ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; } - src_log_flags |= XFS_ILOG_DEXT; + (*src_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: ASSERT(ip->i_d.di_version < 3 || - (src_log_flags & XFS_ILOG_DOWNER)); - src_log_flags |= XFS_ILOG_DBROOT; + (*src_log_flags & XFS_ILOG_DOWNER)); + (*src_log_flags) |= XFS_ILOG_DBROOT; break; } @@ -1738,15 +1904,166 @@ xfs_swap_extents( tifp->if_u1.if_extents = tifp->if_u2.if_inline_ext; } - target_log_flags |= XFS_ILOG_DEXT; + (*target_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: - target_log_flags |= XFS_ILOG_DBROOT; + (*target_log_flags) |= XFS_ILOG_DBROOT; ASSERT(tip->i_d.di_version < 3 || - (target_log_flags & XFS_ILOG_DOWNER)); + (*target_log_flags & XFS_ILOG_DOWNER)); break; } + return 0; +} + +int +xfs_swap_extents( + struct xfs_inode *ip, /* target inode */ + struct xfs_inode *tip, /* tmp inode */ + struct xfs_swapext *sxp) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct xfs_bstat *sbp = &sxp->sx_stat; + int src_log_flags, target_log_flags; + int error = 0; + int lock_flags; + struct xfs_ifork *cowfp; + __uint64_t f; + int resblks; + + /* + * 
Lock the inodes against other IO, page faults and truncate to
+ * begin with. Then we can ensure the inodes are flushed and have no
+ * page cache safely. Once we have done this we can take the ilocks and
+ * do the rest of the checks.
+ */
+ lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+ xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
+
+ /* Verify that both files have the same format */
+ if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* Verify both files are either real-time or non-realtime */
+ if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+
+ error = xfs_swap_extent_flush(ip);
+ if (error)
+ goto out_unlock;
+ error = xfs_swap_extent_flush(tip);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * Extent "swapping" with rmap requires a permanent reservation and
+ * a block reservation because it's really just a remap operation
+ * performed with log redo items!
+ */
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ /*
+ * Conceptually this shouldn't affect the shape of either
+ * bmbt, but since we atomically move extents one by one,
+ * we reserve enough space to rebuild both trees.
+ */
+ resblks = XFS_SWAP_RMAP_SPACE_RES(mp,
+ XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK),
+ XFS_DATA_FORK) +
+ XFS_SWAP_RMAP_SPACE_RES(mp,
+ XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK),
+ XFS_DATA_FORK);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
+ 0, 0, &tp);
+ } else
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0,
+ 0, 0, &tp);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * Lock and join the inodes to the transaction so that transaction commit
+ * or cancel will unlock the inodes from this point onwards.
+ */
+ xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
+ lock_flags |= XFS_ILOCK_EXCL;
+ xfs_trans_ijoin(tp, ip, 0);
+ xfs_trans_ijoin(tp, tip, 0);
+
+ /* Verify all data are being swapped */
+ if (sxp->sx_offset != 0 ||
+ sxp->sx_length != ip->i_d.di_size ||
+ sxp->sx_length != tip->i_d.di_size) {
+ error = -EFAULT;
+ goto out_trans_cancel;
+ }
+
+ trace_xfs_swap_extent_before(ip, 0);
+ trace_xfs_swap_extent_before(tip, 1);
+
+ /* check inode formats now that data is flushed */
+ error = xfs_swap_extents_check_format(ip, tip);
+ if (error) {
+ xfs_notice(mp,
+ "%s: inode 0x%llx format is incompatible for exchanging.",
+ __func__, ip->i_ino);
+ goto out_trans_cancel;
+ }
+
+ /*
+ * Compare the current change & modify times with that
+ * passed in. If they differ, we abort this swap.
+ * This is the mechanism used to assure the calling
+ * process that the file was not changed out from
+ * under it.
+ */
+ if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
+ (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
+ (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
+ (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
+ error = -EBUSY;
+ goto out_trans_cancel;
+ }
+
+ /*
+ * Note the trickiness in setting the log flags - we set the owner log
+ * flag on the opposite inode (i.e. the inode we are setting the new
+ * owner to be) because once we swap the forks and log that, log
+ * recovery is going to see the fork as owned by the swapped inode,
+ * not the pre-swapped inodes.
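+ *
+ * (Editor's illustration, an assumption spelled out rather than patch
+ * text: for two v3 btree-format inodes A and B, the swap issues
+ *
+ *	xfs_bmbt_change_owner(tp, A, XFS_DATA_FORK, B->i_ino, NULL);
+ *	xfs_bmbt_change_owner(tp, B, XFS_DATA_FORK, A->i_ino, NULL);
+ *
+ * while XFS_ILOG_DOWNER for A's old tree is carried in B's log flags,
+ * matching the owner that log recovery will observe after the swap.)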
+ */ + src_log_flags = XFS_ILOG_CORE; + target_log_flags = XFS_ILOG_CORE; + + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + error = xfs_swap_extent_rmap(&tp, ip, tip); + else + error = xfs_swap_extent_forks(tp, ip, tip, &src_log_flags, + &target_log_flags); + if (error) + goto out_trans_cancel; + + /* Do we have to swap reflink flags? */ + if ((ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK) ^ + (tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK)) { + f = ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; + tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK; + cowfp = ip->i_cowfp; + ip->i_cowfp = tip->i_cowfp; + tip->i_cowfp = cowfp; + xfs_inode_set_cowblocks_tag(ip); + xfs_inode_set_cowblocks_tag(tip); + } + xfs_trans_log_inode(tp, ip, src_log_flags); xfs_trans_log_inode(tp, tip, target_log_flags); @@ -1761,16 +2078,16 @@ xfs_swap_extents( trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); -out: - kmem_free(tempifp); - return error; -out_unlock: xfs_iunlock(ip, lock_flags); xfs_iunlock(tip, lock_flags); - goto out; + return error; out_trans_cancel: xfs_trans_cancel(tp); - goto out; + +out_unlock: + xfs_iunlock(ip, lock_flags); + xfs_iunlock(tip, lock_flags); + return error; } diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index f44f79996978..29816981b50a 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -84,7 +84,8 @@ xfs_dir2_sf_getdents( sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); + if (dp->i_d.di_size < xfs_dir2_sf_hdr_size(sfp->i8count)) + return -EFSCORRUPTED; /* * If the block number in the offset is out of range, we're done. diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 3d224702fbc0..05f8666733a0 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -92,7 +92,11 @@ extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_ERRTAG_BMAPIFORMAT 21 #define XFS_ERRTAG_FREE_EXTENT 22 #define XFS_ERRTAG_RMAP_FINISH_ONE 23 -#define XFS_ERRTAG_MAX 24 +#define XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE 24 +#define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25 +#define XFS_ERRTAG_BMAP_FINISH_ONE 26 +#define XFS_ERRTAG_AG_RESV_CRITICAL 27 +#define XFS_ERRTAG_MAX 28 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -121,6 +125,10 @@ extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT #define XFS_RANDOM_FREE_EXTENT 1 #define XFS_RANDOM_RMAP_FINISH_ONE 1 +#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1 +#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 +#define XFS_RANDOM_BMAP_FINISH_ONE 1 +#define XFS_RANDOM_AG_RESV_CRITICAL 4 #ifdef DEBUG extern int xfs_error_test_active; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 2bc58b3fd37d..a314fc7b56fa 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -38,6 +38,7 @@ #include "xfs_icache.h" #include "xfs_pnfs.h" #include "xfs_iomap.h" +#include "xfs_reflink.h" #include <linux/dcache.h> #include <linux/falloc.h> @@ -634,6 +635,13 @@ xfs_file_dio_aio_write( trace_xfs_file_direct_write(ip, count, iocb->ki_pos); + /* If this is a block-aligned directio CoW, remap immediately. 
*/ + if (xfs_is_reflink_inode(ip) && !unaligned_io) { + ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count); + if (ret) + goto out; + } + data = *from; ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, xfs_get_blocks_direct, xfs_end_io_direct_write, @@ -735,6 +743,9 @@ write_retry: enospc = xfs_inode_free_quota_eofblocks(ip); if (enospc) goto write_retry; + enospc = xfs_inode_free_quota_cowblocks(ip); + if (enospc) + goto write_retry; } else if (ret == -ENOSPC && !enospc) { struct xfs_eofblocks eofb = {0}; @@ -774,10 +785,20 @@ xfs_file_write_iter( if (IS_DAX(inode)) ret = xfs_file_dax_write(iocb, from); - else if (iocb->ki_flags & IOCB_DIRECT) + else if (iocb->ki_flags & IOCB_DIRECT) { + /* + * Allow a directio write to fall back to a buffered + * write *only* in the case that we're doing a reflink + * CoW. In all other directio scenarios we do not + * allow an operation to fall back to buffered mode. + */ ret = xfs_file_dio_aio_write(iocb, from); - else + if (ret == -EREMCHG) + goto buffered; + } else { +buffered: ret = xfs_file_buffered_aio_write(iocb, from); + } if (ret > 0) { XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); @@ -791,7 +812,7 @@ xfs_file_write_iter( #define XFS_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ - FALLOC_FL_INSERT_RANGE) + FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE) STATIC long xfs_file_fallocate( @@ -881,9 +902,15 @@ xfs_file_fallocate( if (mode & FALLOC_FL_ZERO_RANGE) error = xfs_zero_file_space(ip, offset, len); - else + else { + if (mode & FALLOC_FL_UNSHARE_RANGE) { + error = xfs_reflink_unshare(ip, offset, len); + if (error) + goto out_unlock; + } error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC); + } if (error) goto out_unlock; } @@ -920,6 +947,189 @@ out_unlock: return error; } +/* + * Flush all file writes out to disk. + */ +static int +xfs_file_wait_for_io( + struct inode *inode, + loff_t offset, + size_t len) +{ + loff_t rounding; + loff_t ioffset; + loff_t iendoffset; + loff_t bs; + int ret; + + bs = inode->i_sb->s_blocksize; + inode_dio_wait(inode); + + rounding = max_t(xfs_off_t, bs, PAGE_SIZE); + ioffset = round_down(offset, rounding); + iendoffset = round_up(offset + len, rounding) - 1; + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + iendoffset); + return ret; +} + +/* Hook up to the VFS reflink function */ +STATIC int +xfs_file_share_range( + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len, + bool is_dedupe) +{ + struct inode *inode_in; + struct inode *inode_out; + ssize_t ret; + loff_t bs; + loff_t isize; + int same_inode; + loff_t blen; + unsigned int flags = 0; + + inode_in = file_inode(file_in); + inode_out = file_inode(file_out); + bs = inode_out->i_sb->s_blocksize; + + /* Don't touch certain kinds of inodes */ + if (IS_IMMUTABLE(inode_out)) + return -EPERM; + if (IS_SWAPFILE(inode_in) || + IS_SWAPFILE(inode_out)) + return -ETXTBSY; + + /* Reflink only works within this filesystem. */ + if (inode_in->i_sb != inode_out->i_sb) + return -EXDEV; + same_inode = (inode_in->i_ino == inode_out->i_ino); + + /* Don't reflink dirs, pipes, sockets... */ + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + return -EISDIR; + if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode)) + return -EINVAL; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + return -EINVAL; + + /* Don't share DAX file data for now. 
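+ * (Editor's note, an assumption from context: the DAX paths in this
+ * series have no CoW remapping support yet, hence the -EINVAL below
+ * for either inode being DAX.)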
*/ + if (IS_DAX(inode_in) || IS_DAX(inode_out)) + return -EINVAL; + + /* Are we going all the way to the end? */ + isize = i_size_read(inode_in); + if (isize == 0) + return 0; + if (len == 0) + len = isize - pos_in; + + /* Ensure offsets don't wrap and the input is inside i_size */ + if (pos_in + len < pos_in || pos_out + len < pos_out || + pos_in + len > isize) + return -EINVAL; + + /* Don't allow dedupe past EOF in the dest file */ + if (is_dedupe) { + loff_t disize; + + disize = i_size_read(inode_out); + if (pos_out >= disize || pos_out + len > disize) + return -EINVAL; + } + + /* If we're linking to EOF, continue to the block boundary. */ + if (pos_in + len == isize) + blen = ALIGN(isize, bs) - pos_in; + else + blen = len; + + /* Only reflink if we're aligned to block boundaries */ + if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || + !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) + return -EINVAL; + + /* Don't allow overlapped reflink within the same file */ + if (same_inode && pos_out + blen > pos_in && pos_out < pos_in + blen) + return -EINVAL; + + /* Wait for the completion of any pending IOs on srcfile */ + ret = xfs_file_wait_for_io(inode_in, pos_in, len); + if (ret) + goto out; + ret = xfs_file_wait_for_io(inode_out, pos_out, len); + if (ret) + goto out; + + if (is_dedupe) + flags |= XFS_REFLINK_DEDUPE; + ret = xfs_reflink_remap_range(XFS_I(inode_in), pos_in, XFS_I(inode_out), + pos_out, len, flags); + if (ret < 0) + goto out; + +out: + return ret; +} + +STATIC ssize_t +xfs_file_copy_range( + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + size_t len, + unsigned int flags) +{ + int error; + + error = xfs_file_share_range(file_in, pos_in, file_out, pos_out, + len, false); + if (error) + return error; + return len; +} + +STATIC int +xfs_file_clone_range( + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len) +{ + return xfs_file_share_range(file_in, pos_in, file_out, pos_out, + len, false); +} + +#define XFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) +STATIC ssize_t +xfs_file_dedupe_range( + struct file *src_file, + u64 loff, + u64 len, + struct file *dst_file, + u64 dst_loff) +{ + int error; + + /* + * Limit the total length we will dedupe for each operation. + * This is intended to bound the total time spent in this + * ioctl to something sane. + */ + if (len > XFS_MAX_DEDUPE_LEN) + len = XFS_MAX_DEDUPE_LEN; + + error = xfs_file_share_range(src_file, loff, dst_file, dst_loff, + len, true); + if (error) + return error; + return len; +} STATIC int xfs_file_open( @@ -1581,6 +1791,9 @@ const struct file_operations xfs_file_operations = { .fsync = xfs_file_fsync, .get_unmapped_area = thp_get_unmapped_area, .fallocate = xfs_file_fallocate, + .copy_file_range = xfs_file_copy_range, + .clone_file_range = xfs_file_clone_range, + .dedupe_file_range = xfs_file_dedupe_range, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 94ac06f3d908..93d12fa2670d 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -43,6 +43,7 @@ #include "xfs_log.h" #include "xfs_filestream.h" #include "xfs_rmap.h" +#include "xfs_ag_resv.h" /* * File system operations @@ -108,7 +109,9 @@ xfs_fs_geometry( (xfs_sb_version_hassparseinodes(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_SPINODES : 0) | (xfs_sb_version_hasrmapbt(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_RMAPBT : 0); + XFS_FSOP_GEOM_FLAGS_RMAPBT : 0) | + (xfs_sb_version_hasreflink(&mp->m_sb) ? 
+ XFS_FSOP_GEOM_FLAGS_REFLINK : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; @@ -259,6 +262,12 @@ xfs_growfs_data_private( agf->agf_longest = cpu_to_be32(tmpsize); if (xfs_sb_version_hascrc(&mp->m_sb)) uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + agf->agf_refcount_root = cpu_to_be32( + xfs_refc_block(mp)); + agf->agf_refcount_level = cpu_to_be32(1); + agf->agf_refcount_blocks = cpu_to_be32(1); + } error = xfs_bwrite(bp); xfs_buf_relse(bp); @@ -450,6 +459,17 @@ xfs_growfs_data_private( rrec->rm_offset = 0; be16_add_cpu(&block->bb_numrecs, 1); + /* account for refc btree root */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + rrec = XFS_RMAP_REC_ADDR(block, 5); + rrec->rm_startblock = cpu_to_be32( + xfs_refc_block(mp)); + rrec->rm_blockcount = cpu_to_be32(1); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC); + rrec->rm_offset = 0; + be16_add_cpu(&block->bb_numrecs, 1); + } + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -507,6 +527,28 @@ xfs_growfs_data_private( goto error0; } + /* + * refcount btree root block + */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, xfs_refc_block(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_refcountbt_buf_ops); + if (!bp) { + error = -ENOMEM; + goto error0; + } + + xfs_btree_init_block(mp, bp, XFS_REFC_CRC_MAGIC, + 0, 0, agno, + XFS_BTREE_CRC_BLOCKS); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + } } xfs_trans_agblocks_delta(tp, nfree); /* @@ -589,6 +631,11 @@ xfs_growfs_data_private( xfs_set_low_space_thresholds(mp); mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); + /* Reserve AG metadata blocks. */ + error = xfs_fs_reserve_ag_blocks(mp); + if (error && error != -ENOSPC) + goto out; + /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { error = 0; @@ -639,6 +686,8 @@ xfs_growfs_data_private( continue; } } + + out: return saved_error ? saved_error : error; error0: @@ -948,3 +997,59 @@ xfs_do_force_shutdown( "Please umount the filesystem and rectify the problem(s)"); } } + +/* + * Reserve free space for per-AG metadata. + */ +int +xfs_fs_reserve_ag_blocks( + struct xfs_mount *mp) +{ + xfs_agnumber_t agno; + struct xfs_perag *pag; + int error = 0; + int err2; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + pag = xfs_perag_get(mp, agno); + err2 = xfs_ag_resv_init(pag); + xfs_perag_put(pag); + if (err2 && !error) + error = err2; + } + + if (error && error != -ENOSPC) { + xfs_warn(mp, + "Error %d reserving per-AG metadata reserve pool.", error); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + } + + return error; +} + +/* + * Free space reserved for per-AG metadata. 
+ */ +int +xfs_fs_unreserve_ag_blocks( + struct xfs_mount *mp) +{ + xfs_agnumber_t agno; + struct xfs_perag *pag; + int error = 0; + int err2; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + pag = xfs_perag_get(mp, agno); + err2 = xfs_ag_resv_free(pag); + xfs_perag_put(pag); + if (err2 && !error) + error = err2; + } + + if (error) + xfs_warn(mp, + "Error %d freeing per-AG metadata reserve pool.", error); + + return error; +} diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index f32713f14f9a..f34915898fea 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -26,4 +26,7 @@ extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); +extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); +extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp); + #endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 4d41b241298f..687a4b01fc53 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -21,8 +21,8 @@ /* * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, * other XFS code uses these values. Times are measured in centisecs (i.e. - * 100ths of a second) with the exception of eofb_timer, which is measured in - * seconds. + * 100ths of a second) with the exception of eofb_timer and cowb_timer, which + * are measured in seconds. */ xfs_param_t xfs_params = { /* MIN DFLT MAX */ @@ -42,6 +42,7 @@ xfs_param_t xfs_params = { .inherit_nodfrg = { 0, 1, 1 }, .fstrm_timer = { 1, 30*100, 3600*100}, .eofb_timer = { 1, 300, 3600*24}, + .cowb_timer = { 1, 1800, 3600*24}, }; struct xfs_globals xfs_globals = { diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 65b2e3f85f52..14796b744e0a 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -33,6 +33,7 @@ #include "xfs_bmap_util.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" +#include "xfs_reflink.h" #include <linux/kthread.h> #include <linux/freezer.h> @@ -76,6 +77,9 @@ xfs_inode_alloc( ip->i_mount = mp; memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); ip->i_afp = NULL; + ip->i_cowfp = NULL; + ip->i_cnextents = 0; + ip->i_cformat = XFS_DINODE_FMT_EXTENTS; memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); ip->i_flags = 0; ip->i_delayed_blks = 0; @@ -101,6 +105,8 @@ xfs_inode_free_callback( if (ip->i_afp) xfs_idestroy_fork(ip, XFS_ATTR_FORK); + if (ip->i_cowfp) + xfs_idestroy_fork(ip, XFS_COW_FORK); if (ip->i_itemp) { ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); @@ -787,6 +793,33 @@ xfs_eofblocks_worker( xfs_queue_eofblocks(mp); } +/* + * Background scanning to trim preallocated CoW space. This is queued + * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default). + * (We'll just piggyback on the post-EOF prealloc space workqueue.) 
+ */ +STATIC void +xfs_queue_cowblocks( + struct xfs_mount *mp) +{ + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG)) + queue_delayed_work(mp->m_eofblocks_workqueue, + &mp->m_cowblocks_work, + msecs_to_jiffies(xfs_cowb_secs * 1000)); + rcu_read_unlock(); +} + +void +xfs_cowblocks_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_cowblocks_work); + xfs_icache_free_cowblocks(mp, NULL); + xfs_queue_cowblocks(mp); +} + int xfs_inode_ag_iterator( struct xfs_mount *mp, @@ -1343,18 +1376,30 @@ xfs_inode_free_eofblocks( return ret; } -int -xfs_icache_free_eofblocks( +static int +__xfs_icache_free_eofblocks( struct xfs_mount *mp, - struct xfs_eofblocks *eofb) + struct xfs_eofblocks *eofb, + int (*execute)(struct xfs_inode *ip, int flags, + void *args), + int tag) { int flags = SYNC_TRYLOCK; if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC)) flags = SYNC_WAIT; - return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags, - eofb, XFS_ICI_EOFBLOCKS_TAG); + return xfs_inode_ag_iterator_tag(mp, execute, flags, + eofb, tag); +} + +int +xfs_icache_free_eofblocks( + struct xfs_mount *mp, + struct xfs_eofblocks *eofb) +{ + return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_eofblocks, + XFS_ICI_EOFBLOCKS_TAG); } /* @@ -1363,9 +1408,11 @@ xfs_icache_free_eofblocks( * failure. We make a best effort by including each quota under low free space * conditions (less than 1% free space) in the scan. */ -int -xfs_inode_free_quota_eofblocks( - struct xfs_inode *ip) +static int +__xfs_inode_free_quota_eofblocks( + struct xfs_inode *ip, + int (*execute)(struct xfs_mount *mp, + struct xfs_eofblocks *eofb)) { int scan = 0; struct xfs_eofblocks eofb = {0}; @@ -1401,14 +1448,25 @@ xfs_inode_free_quota_eofblocks( } if (scan) - xfs_icache_free_eofblocks(ip->i_mount, &eofb); + execute(ip->i_mount, &eofb); return scan; } -void -xfs_inode_set_eofblocks_tag( - xfs_inode_t *ip) +int +xfs_inode_free_quota_eofblocks( + struct xfs_inode *ip) +{ + return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks); +} + +static void +__xfs_inode_set_eofblocks_tag( + xfs_inode_t *ip, + void (*execute)(struct xfs_mount *mp), + void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, + int error, unsigned long caller_ip), + int tag) { struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; @@ -1426,26 +1484,22 @@ xfs_inode_set_eofblocks_tag( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); - trace_xfs_inode_set_eofblocks_tag(ip); - tagged = radix_tree_tagged(&pag->pag_ici_root, - XFS_ICI_EOFBLOCKS_TAG); + tagged = radix_tree_tagged(&pag->pag_ici_root, tag); radix_tree_tag_set(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), - XFS_ICI_EOFBLOCKS_TAG); + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag); if (!tagged) { /* propagate the eofblocks tag up into the perag radix tree */ spin_lock(&ip->i_mount->m_perag_lock); radix_tree_tag_set(&ip->i_mount->m_perag_tree, XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_EOFBLOCKS_TAG); + tag); spin_unlock(&ip->i_mount->m_perag_lock); /* kick off background trimming */ - xfs_queue_eofblocks(ip->i_mount); + execute(ip->i_mount); - trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno, - -1, _RET_IP_); + set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_); } spin_unlock(&pag->pag_ici_lock); @@ -1453,9 +1507,22 @@ xfs_inode_set_eofblocks_tag( } void -xfs_inode_clear_eofblocks_tag( +xfs_inode_set_eofblocks_tag( 
xfs_inode_t *ip) { + trace_xfs_inode_set_eofblocks_tag(ip); + return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_eofblocks, + trace_xfs_perag_set_eofblocks, + XFS_ICI_EOFBLOCKS_TAG); +} + +static void +__xfs_inode_clear_eofblocks_tag( + xfs_inode_t *ip, + void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, + int error, unsigned long caller_ip), + int tag) +{ struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; @@ -1465,23 +1532,141 @@ xfs_inode_clear_eofblocks_tag( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); - trace_xfs_inode_clear_eofblocks_tag(ip); radix_tree_tag_clear(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), - XFS_ICI_EOFBLOCKS_TAG); - if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) { + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag); + if (!radix_tree_tagged(&pag->pag_ici_root, tag)) { /* clear the eofblocks tag from the perag radix tree */ spin_lock(&ip->i_mount->m_perag_lock); radix_tree_tag_clear(&ip->i_mount->m_perag_tree, XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_EOFBLOCKS_TAG); + tag); spin_unlock(&ip->i_mount->m_perag_lock); - trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno, - -1, _RET_IP_); + clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_); } spin_unlock(&pag->pag_ici_lock); xfs_perag_put(pag); } +void +xfs_inode_clear_eofblocks_tag( + xfs_inode_t *ip) +{ + trace_xfs_inode_clear_eofblocks_tag(ip); + return __xfs_inode_clear_eofblocks_tag(ip, + trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG); +} + +/* + * Automatic CoW Reservation Freeing + * + * These functions automatically garbage collect leftover CoW reservations + * that were made on behalf of a cowextsize hint when we start to run out + * of quota or when the reservations sit around for too long. If the file + * has dirty pages or is undergoing writeback, its CoW reservations will + * be retained. + * + * The actual garbage collection piggybacks off the same code that runs + * the speculative EOF preallocation garbage collector. + */ +STATIC int +xfs_inode_free_cowblocks( + struct xfs_inode *ip, + int flags, + void *args) +{ + int ret; + struct xfs_eofblocks *eofb = args; + bool need_iolock = true; + int match; + + ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); + + if (!xfs_reflink_has_real_cow_blocks(ip)) { + trace_xfs_inode_free_cowblocks_invalid(ip); + xfs_inode_clear_cowblocks_tag(ip); + return 0; + } + + /* + * If the mapping is dirty or under writeback we cannot touch the + * CoW fork. Leave it alone if we're in the midst of a directio. + */ + if (mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || + mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || + atomic_read(&VFS_I(ip)->i_dio_count)) + return 0; + + if (eofb) { + if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) + match = xfs_inode_match_id_union(ip, eofb); + else + match = xfs_inode_match_id(ip, eofb); + if (!match) + return 0; + + /* skip the inode if the file size is too small */ + if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && + XFS_ISIZE(ip) < eofb->eof_min_file_size) + return 0; + + /* + * A scan owner implies we already hold the iolock. Skip it in + * xfs_free_eofblocks() to avoid deadlock. This also eliminates + * the possibility of EAGAIN being returned. 
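+ * (Editor's note: in other words, when the walk was kicked off from
+ * this inode's own context, eof_scan_owner equals ip->i_ino, the
+ * IOLOCK is already held, and taking it again below would
+ * self-deadlock.)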
+ */ + if (eofb->eof_scan_owner == ip->i_ino) + need_iolock = false; + } + + /* Free the CoW blocks */ + if (need_iolock) { + xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + } + + ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); + + if (need_iolock) { + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + } + + return ret; +} + +int +xfs_icache_free_cowblocks( + struct xfs_mount *mp, + struct xfs_eofblocks *eofb) +{ + return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_cowblocks, + XFS_ICI_COWBLOCKS_TAG); +} + +int +xfs_inode_free_quota_cowblocks( + struct xfs_inode *ip) +{ + return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks); +} + +void +xfs_inode_set_cowblocks_tag( + xfs_inode_t *ip) +{ + trace_xfs_inode_set_eofblocks_tag(ip); + return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_cowblocks, + trace_xfs_perag_set_eofblocks, + XFS_ICI_COWBLOCKS_TAG); +} + +void +xfs_inode_clear_cowblocks_tag( + xfs_inode_t *ip) +{ + trace_xfs_inode_clear_eofblocks_tag(ip); + return __xfs_inode_clear_eofblocks_tag(ip, + trace_xfs_perag_clear_eofblocks, XFS_ICI_COWBLOCKS_TAG); +} diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 05bac99bef75..a1e02f4708ab 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -40,6 +40,7 @@ struct xfs_eofblocks { in xfs_inode_ag_iterator */ #define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ #define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ +#define XFS_ICI_COWBLOCKS_TAG 2 /* inode can have cow blocks to gc */ /* * Flags for xfs_iget() @@ -70,6 +71,12 @@ int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip); void xfs_eofblocks_worker(struct work_struct *); void xfs_queue_eofblocks(struct xfs_mount *); +void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip); +void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); +int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *); +int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip); +void xfs_cowblocks_worker(struct work_struct *); + int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 624e1dfa716b..4e560e6a12c1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -49,6 +49,7 @@ #include "xfs_trans_priv.h" #include "xfs_log.h" #include "xfs_bmap_btree.h" +#include "xfs_reflink.h" kmem_zone_t *xfs_inode_zone; @@ -77,6 +78,29 @@ xfs_get_extsz_hint( } /* + * Helper function to extract CoW extent size hint from inode. + * Between the extent size hint and the CoW extent size hint, we + * return the greater of the two. If the value is zero (automatic), + * use the default size. + */ +xfs_extlen_t +xfs_get_cowextsz_hint( + struct xfs_inode *ip) +{ + xfs_extlen_t a, b; + + a = 0; + if (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) + a = ip->i_d.di_cowextsize; + b = xfs_get_extsz_hint(ip); + + a = max(a, b); + if (a == 0) + return XFS_DEFAULT_COWEXTSZ_HINT; + return a; +} + +/* * These two are wrapper routines around the xfs_ilock() routine used to * centralize some grungy code. They are used in places that wish to lock the * inode solely for reading the extents. 
The reason these places can't just @@ -651,6 +675,8 @@ _xfs_dic2xflags( if (di_flags2 & XFS_DIFLAG2_ANY) { if (di_flags2 & XFS_DIFLAG2_DAX) flags |= FS_XFLAG_DAX; + if (di_flags2 & XFS_DIFLAG2_COWEXTSIZE) + flags |= FS_XFLAG_COWEXTSIZE; } if (has_attr) @@ -834,6 +860,7 @@ xfs_ialloc( if (ip->i_d.di_version == 3) { inode->i_version = 1; ip->i_d.di_flags2 = 0; + ip->i_d.di_cowextsize = 0; ip->i_d.di_crtime.t_sec = (__int32_t)tv.tv_sec; ip->i_d.di_crtime.t_nsec = (__int32_t)tv.tv_nsec; } @@ -896,6 +923,15 @@ xfs_ialloc( ip->i_d.di_flags |= di_flags; ip->i_d.di_flags2 |= di_flags2; } + if (pip && + (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) && + pip->i_d.di_version == 3 && + ip->i_d.di_version == 3) { + if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { + ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; + } + } /* FALLTHROUGH */ case S_IFLNK: ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; @@ -1586,6 +1622,20 @@ xfs_itruncate_extents( goto out; } + /* Remove all pending CoW reservations. */ + error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block, + last_block); + if (error) + goto out; + + /* + * Clear the reflink flag if we truncated everything. + */ + if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) { + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + xfs_inode_clear_cowblocks_tag(ip); + } + /* * Always re-log the inode so that our permanent transaction can keep * on rolling it forward in the log. @@ -1850,6 +1900,7 @@ xfs_inactive( } mp = ip->i_mount; + ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY)); /* If this is a read-only mount, don't do this (would generate I/O) */ if (mp->m_flags & XFS_MOUNT_RDONLY) diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 8f30d2533b48..f14c1de2549d 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -47,6 +47,7 @@ typedef struct xfs_inode { /* Extent information. */ xfs_ifork_t *i_afp; /* attribute fork pointer */ + xfs_ifork_t *i_cowfp; /* copy on write extents */ xfs_ifork_t i_df; /* data fork */ /* operations vectors */ @@ -65,6 +66,9 @@ typedef struct xfs_inode { struct xfs_icdinode i_d; /* most of ondisk inode */ + xfs_extnum_t i_cnextents; /* # of extents in cow fork */ + unsigned int i_cformat; /* format of cow fork */ + /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ } xfs_inode_t; @@ -202,6 +206,11 @@ xfs_get_initial_prid(struct xfs_inode *dp) return XFS_PROJID_DEFAULT; } +static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) +{ + return ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; +} + /* * In-core inode flags. */ @@ -217,6 +226,12 @@ xfs_get_initial_prid(struct xfs_inode *dp) #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) #define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */ #define XFS_IEOFBLOCKS (1 << 10)/* has the preallocblocks tag set */ +/* + * If this unlinked inode is in the middle of recovery, don't let drop_inode + * truncate and free the inode. This can happen if we iget the inode during + * log recovery to replay a bmap operation on the inode. 
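+ * (Editor's sketch of the intended lifecycle, an assumption from the
+ * surrounding hunks rather than patch text:
+ *
+ *	xfs_iflags_set(ip, XFS_IRECOVERY);	// while intents replay
+ *	...
+ *	xfs_iflags_clear(ip, XFS_IRECOVERY);	// in iunlink processing
+ *
+ * the clear is visible in xlog_recover_process_one_iunlink() in this
+ * series.)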
+ */ +#define XFS_IRECOVERY (1 << 11) /* * Per-lifetime flags need to be reset when re-using a reclaimable inode during @@ -411,6 +426,7 @@ int xfs_iflush(struct xfs_inode *, struct xfs_buf **); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); +xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, @@ -474,4 +490,7 @@ do { \ extern struct kmem_zone *xfs_inode_zone; +/* The default CoW extent size hint. */ +#define XFS_DEFAULT_COWEXTSZ_HINT 32 + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 892c2aced207..9610e9c00952 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -368,7 +368,7 @@ xfs_inode_to_log_dinode( to->di_crtime.t_sec = from->di_crtime.t_sec; to->di_crtime.t_nsec = from->di_crtime.t_nsec; to->di_flags2 = from->di_flags2; - + to->di_cowextsize = from->di_cowextsize; to->di_ino = ip->i_ino; to->di_lsn = lsn; memset(to->di_pad2, 0, sizeof(to->di_pad2)); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0d9021f0551e..c245bed3249b 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -903,6 +903,8 @@ xfs_ioc_fsgetxattr( xfs_ilock(ip, XFS_ILOCK_SHARED); fa.fsx_xflags = xfs_ip2xflags(ip); fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; + fa.fsx_cowextsize = ip->i_d.di_cowextsize << + ip->i_mount->m_sb.sb_blocklog; fa.fsx_projid = xfs_get_projid(ip); if (attr) { @@ -973,12 +975,13 @@ xfs_set_diflags( if (ip->i_d.di_version < 3) return; - di_flags2 = 0; + di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); if (xflags & FS_XFLAG_DAX) di_flags2 |= XFS_DIFLAG2_DAX; + if (xflags & FS_XFLAG_COWEXTSIZE) + di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; ip->i_d.di_flags2 = di_flags2; - } STATIC void @@ -1031,6 +1034,14 @@ xfs_ioctl_setattr_xflags( return -EINVAL; } + /* Clear reflink if we are actually able to set the rt flag. */ + if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip)) + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + + /* Don't allow us to set DAX mode for a reflinked file for now. */ + if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip)) + return -EINVAL; + /* * Can't modify an immutable/append-only file unless * we have appropriate permission. @@ -1219,6 +1230,56 @@ xfs_ioctl_setattr_check_extsize( return 0; } +/* + * CoW extent size hint validation rules are: + * + * 1. CoW extent size hint can only be set if reflink is enabled on the fs. + * The inode does not have to have any shared blocks, but it must be a v3. + * 2. FS_XFLAG_COWEXTSIZE is only valid for directories and regular files; + * for a directory, the hint is propagated to new files. + * 3. Can be changed on files & directories at any time. + * 4. CoW extsize hint of 0 turns off hints, clears inode flags. + * 5. Extent size must be a multiple of the appropriate block size. + * 6. The extent size hint must be limited to half the AG size to avoid + * alignment extending the extent beyond the limits of the AG. 
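+ *
+ * (Editor's worked example under assumed geometry, not patch text:
+ * with 4096-byte blocks and sb_agblocks = 1048576, a 1 GiB hint gives
+ *
+ *	cowextsize_fsb = XFS_B_TO_FSB(mp, 1073741824);	// 262144 blocks
+ *
+ * which passes 262144 <= 1048576 / 2 and 1073741824 % 4096 == 0,
+ * while a 3 GiB hint (786432 blocks) fails the half-AG check below.)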
+ */ +static int +xfs_ioctl_setattr_check_cowextsize( + struct xfs_inode *ip, + struct fsxattr *fa) +{ + struct xfs_mount *mp = ip->i_mount; + + if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE)) + return 0; + + if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb) || + ip->i_d.di_version != 3) + return -EINVAL; + + if (!S_ISREG(VFS_I(ip)->i_mode) && !S_ISDIR(VFS_I(ip)->i_mode)) + return -EINVAL; + + if (fa->fsx_cowextsize != 0) { + xfs_extlen_t size; + xfs_fsblock_t cowextsize_fsb; + + cowextsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_cowextsize); + if (cowextsize_fsb > MAXEXTLEN) + return -EINVAL; + + size = mp->m_sb.sb_blocksize; + if (cowextsize_fsb > mp->m_sb.sb_agblocks / 2) + return -EINVAL; + + if (fa->fsx_cowextsize % size) + return -EINVAL; + } else + fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; + + return 0; +} + static int xfs_ioctl_setattr_check_projid( struct xfs_inode *ip, @@ -1311,6 +1372,10 @@ xfs_ioctl_setattr( if (code) goto error_trans_cancel; + code = xfs_ioctl_setattr_check_cowextsize(ip, fa); + if (code) + goto error_trans_cancel; + code = xfs_ioctl_setattr_xflags(tp, ip, fa); if (code) goto error_trans_cancel; @@ -1346,6 +1411,12 @@ xfs_ioctl_setattr( ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; else ip->i_d.di_extsize = 0; + if (ip->i_d.di_version == 3 && + (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) + ip->i_d.di_cowextsize = fa->fsx_cowextsize >> + mp->m_sb.sb_blocklog; + else + ip->i_d.di_cowextsize = 0; code = xfs_trans_commit(tp); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index c08253e11545..d907eb9f8ef3 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -39,6 +39,7 @@ #include "xfs_quota.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" +#include "xfs_reflink.h" #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ @@ -70,7 +71,7 @@ xfs_bmbt_to_iomap( iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); } -static xfs_extlen_t +xfs_extlen_t xfs_eof_alignment( struct xfs_inode *ip, xfs_extlen_t extsize) @@ -609,7 +610,7 @@ xfs_file_iomap_begin_delay( } retry: - error = xfs_bmapi_reserve_delalloc(ip, offset_fsb, + error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb, end_fsb - offset_fsb, &got, &prev, &idx, eof); switch (error) { @@ -666,6 +667,7 @@ out_unlock: int xfs_iomap_write_allocate( xfs_inode_t *ip, + int whichfork, xfs_off_t offset, xfs_bmbt_irec_t *imap) { @@ -678,8 +680,12 @@ xfs_iomap_write_allocate( xfs_trans_t *tp; int nimaps; int error = 0; + int flags = 0; int nres; + if (whichfork == XFS_COW_FORK) + flags |= XFS_BMAPI_COWFORK; + /* * Make sure that the dquots are there. */ @@ -773,7 +779,7 @@ xfs_iomap_write_allocate( * pointer that the caller gave to us. */ error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, 0, &first_block, + count_fsb, flags, &first_block, nres, imap, &nimaps, &dfops); if (error) @@ -955,14 +961,22 @@ xfs_file_iomap_begin( struct xfs_mount *mp = ip->i_mount; struct xfs_bmbt_irec imap; xfs_fileoff_t offset_fsb, end_fsb; + bool shared, trimmed; int nimaps = 1, error = 0; unsigned lockmode; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if ((flags & IOMAP_WRITE) && - !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { + if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { + error = xfs_reflink_reserve_cow_range(ip, offset, length); + if (error < 0) + return error; + } + + if ((flags & IOMAP_WRITE) && !IS_DAX(inode) && + !xfs_get_extsz_hint(ip)) { + /* Reserve delalloc blocks for regular writeback. 
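+ * (Editor's note: reflink inodes were already handled just above by
+ * xfs_reflink_reserve_cow_range(); this delalloc path covers plain
+ * non-DAX writes with no extent size hint.)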
*/ return xfs_file_iomap_begin_delay(inode, offset, length, flags, iomap); } @@ -976,7 +990,14 @@ xfs_file_iomap_begin( end_fsb = XFS_B_TO_FSB(mp, offset + length); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, - &nimaps, XFS_BMAPI_ENTIRE); + &nimaps, 0); + if (error) { + xfs_iunlock(ip, lockmode); + return error; + } + + /* Trim the mapping to the nearest shared extent boundary. */ + error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); if (error) { xfs_iunlock(ip, lockmode); return error; @@ -1015,6 +1036,8 @@ xfs_file_iomap_begin( } xfs_bmbt_to_iomap(ip, iomap, &imap); + if (shared) + iomap->flags |= IOMAP_F_SHARED; return 0; } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 6498be485932..6d45cf01fcff 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -25,12 +25,13 @@ struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *, int); -int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, +int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t, struct xfs_bmbt_irec *); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, struct xfs_bmbt_irec *); +xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize); extern struct iomap_ops xfs_iomap_ops; extern struct iomap_ops xfs_xattr_iomap_ops; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index c5da95eb79b8..405a65cd9d6b 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1159,6 +1159,7 @@ xfs_diflags_to_iflags( inode->i_flags |= S_NOATIME; if (S_ISREG(inode->i_mode) && ip->i_mount->m_sb.sb_blocksize == PAGE_SIZE && + !xfs_is_reflink_inode(ip) && (ip->i_mount->m_flags & XFS_MOUNT_DAX || ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) inode->i_flags |= S_DAX; diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index ce73eb34620d..66e881790c17 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -66,7 +66,7 @@ xfs_bulkstat_one_int( if (!buffer || xfs_internal_inum(mp, ino)) return -EINVAL; - buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL); + buf = kmem_zalloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL); if (!buf) return -ENOMEM; @@ -111,6 +111,12 @@ xfs_bulkstat_one_int( buf->bs_aextents = dic->di_anextents; buf->bs_forkoff = XFS_IFORK_BOFF(ip); + if (dic->di_version == 3) { + if (dic->di_flags2 & XFS_DIFLAG2_COWEXTSIZE) + buf->bs_cowextsize = dic->di_cowextsize << + mp->m_sb.sb_blocklog; + } + switch (dic->di_format) { case XFS_DINODE_FMT_DEV: buf->bs_rdev = ip->i_df.if_u2.if_rdev; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index b8d64d520e12..68640fb63a54 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -116,6 +116,7 @@ typedef __u32 xfs_nlink_t; #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val #define xfs_fstrm_centisecs xfs_params.fstrm_timer.val #define xfs_eofb_secs xfs_params.eofb_timer.val +#define xfs_cowb_secs xfs_params.cowb_timer.val #define current_cpu() (raw_smp_processor_id()) #define current_pid() (current->pid) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 846483d56949..9b3d7c76915d 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -45,6 +45,8 @@ #include "xfs_dir2.h" #include "xfs_rmap_item.h" #include "xfs_buf_item.h" +#include "xfs_refcount_item.h" +#include "xfs_bmap_item.h" #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) @@ -1924,6 +1926,10 @@ xlog_recover_reorder_trans( case XFS_LI_EFI: case XFS_LI_RUI: case XFS_LI_RUD: 
+ case XFS_LI_CUI:
+ case XFS_LI_CUD:
+ case XFS_LI_BUI:
+ case XFS_LI_BUD:
 trace_xfs_log_recover_item_reorder_tail(log,
 trans, item, pass);
 list_move_tail(&item->ri_list, &inode_list);
@@ -2242,6 +2248,7 @@ xlog_recover_get_buf_lsn(
 case XFS_ABTB_MAGIC:
 case XFS_ABTC_MAGIC:
 case XFS_RMAP_CRC_MAGIC:
+ case XFS_REFC_CRC_MAGIC:
 case XFS_IBT_CRC_MAGIC:
 case XFS_IBT_MAGIC: {
 struct xfs_btree_block *btb = blk;
@@ -2415,6 +2422,9 @@ xlog_recover_validate_buf_type(
 case XFS_RMAP_CRC_MAGIC:
 bp->b_ops = &xfs_rmapbt_buf_ops;
 break;
+ case XFS_REFC_CRC_MAGIC:
+ bp->b_ops = &xfs_refcountbt_buf_ops;
+ break;
 default:
 warnmsg = "Bad btree block magic!";
 break;
@@ -3547,6 +3557,242 @@ xlog_recover_rud_pass2(
 }
 
 /*
+ * Copy a CUI format buffer from the given buf, and into the destination
+ * CUI format structure. The CUI/CUD items were designed not to need any
+ * special alignment handling.
+ */
+static int
+xfs_cui_copy_format(
+ struct xfs_log_iovec *buf,
+ struct xfs_cui_log_format *dst_cui_fmt)
+{
+ struct xfs_cui_log_format *src_cui_fmt;
+ uint len;
+
+ src_cui_fmt = buf->i_addr;
+ len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents);
+
+ if (buf->i_len == len) {
+ memcpy(dst_cui_fmt, src_cui_fmt, len);
+ return 0;
+ }
+ return -EFSCORRUPTED;
+}
+
+/*
+ * This routine is called to create an in-core extent refcount update
+ * item from the cui format structure which was logged on disk.
+ * It allocates an in-core cui, copies the extents from the format
+ * structure into it, and adds the cui to the AIL with the given
+ * LSN.
+ */
+STATIC int
+xlog_recover_cui_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ int error;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_cui_log_item *cuip;
+ struct xfs_cui_log_format *cui_formatp;
+
+ cui_formatp = item->ri_buf[0].i_addr;
+
+ cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
+ error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format);
+ if (error) {
+ xfs_cui_item_free(cuip);
+ return error;
+ }
+ atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
+
+ spin_lock(&log->l_ailp->xa_lock);
+ /*
+ * The CUI has two references. One for the CUD and one for CUI to ensure
+ * it makes it into the AIL. Insert the CUI into the AIL directly and
+ * drop the CUI reference. Note that xfs_trans_ail_update() drops the
+ * AIL lock.
+ */
+ xfs_trans_ail_update(log->l_ailp, &cuip->cui_item, lsn);
+ xfs_cui_release(cuip);
+ return 0;
+}
+
+
+/*
+ * This routine is called when a CUD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding CUI if it
+ * was still in the log. To do this it searches the AIL for the CUI with an id
+ * equal to that in the CUD format structure. If we find it we drop the CUD
+ * reference, which removes the CUI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_cud_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_cud_log_format *cud_formatp;
+ struct xfs_cui_log_item *cuip = NULL;
+ struct xfs_log_item *lip;
+ __uint64_t cui_id;
+ struct xfs_ail_cursor cur;
+ struct xfs_ail *ailp = log->l_ailp;
+
+ cud_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format))
+ return -EFSCORRUPTED;
+ cui_id = cud_formatp->cud_cui_id;
+
+ /*
+ * Search for the CUI with the id in the CUD format structure in the
+ * AIL.
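+ * (Editor's note: this is the stock intent-cancellation walk - hold
+ * the AIL lock, cursor through the items, and on a matching cui_id
+ * drop the lock around xfs_cui_release(), since the release may free
+ * the item, before finishing the cursor.)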
+ */
+ spin_lock(&ailp->xa_lock);
+ lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+ while (lip != NULL) {
+ if (lip->li_type == XFS_LI_CUI) {
+ cuip = (struct xfs_cui_log_item *)lip;
+ if (cuip->cui_format.cui_id == cui_id) {
+ /*
+ * Drop the CUD reference to the CUI. This
+ * removes the CUI from the AIL and frees it.
+ */
+ spin_unlock(&ailp->xa_lock);
+ xfs_cui_release(cuip);
+ spin_lock(&ailp->xa_lock);
+ break;
+ }
+ }
+ lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ }
+
+ xfs_trans_ail_cursor_done(&cur);
+ spin_unlock(&ailp->xa_lock);
+
+ return 0;
+}
+
+/*
+ * Copy a BUI format buffer from the given buf, and into the destination
+ * BUI format structure. The BUI/BUD items were designed not to need any
+ * special alignment handling.
+ */
+static int
+xfs_bui_copy_format(
+ struct xfs_log_iovec *buf,
+ struct xfs_bui_log_format *dst_bui_fmt)
+{
+ struct xfs_bui_log_format *src_bui_fmt;
+ uint len;
+
+ src_bui_fmt = buf->i_addr;
+ len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents);
+
+ if (buf->i_len == len) {
+ memcpy(dst_bui_fmt, src_bui_fmt, len);
+ return 0;
+ }
+ return -EFSCORRUPTED;
+}
+
+/*
+ * This routine is called to create an in-core extent bmap update
+ * item from the bui format structure which was logged on disk.
+ * It allocates an in-core bui, copies the extents from the format
+ * structure into it, and adds the bui to the AIL with the given
+ * LSN.
+ */
+STATIC int
+xlog_recover_bui_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ int error;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_bui_log_item *buip;
+ struct xfs_bui_log_format *bui_formatp;
+
+ bui_formatp = item->ri_buf[0].i_addr;
+
+ if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS)
+ return -EFSCORRUPTED;
+ buip = xfs_bui_init(mp);
+ error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format);
+ if (error) {
+ xfs_bui_item_free(buip);
+ return error;
+ }
+ atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents);
+
+ spin_lock(&log->l_ailp->xa_lock);
+ /*
+ * The BUI has two references. One for the BUD and one for BUI to ensure
+ * it makes it into the AIL. Insert the BUI into the AIL directly and
+ * drop the BUI reference. Note that xfs_trans_ail_update() drops the
+ * AIL lock.
+ */
+ xfs_trans_ail_update(log->l_ailp, &buip->bui_item, lsn);
+ xfs_bui_release(buip);
+ return 0;
+}
+
+
+/*
+ * This routine is called when a BUD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding BUI if it
+ * was still in the log. To do this it searches the AIL for the BUI with an id
+ * equal to that in the BUD format structure. If we find it we drop the BUD
+ * reference, which removes the BUI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_bud_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_bud_log_format *bud_formatp;
+ struct xfs_bui_log_item *buip = NULL;
+ struct xfs_log_item *lip;
+ __uint64_t bui_id;
+ struct xfs_ail_cursor cur;
+ struct xfs_ail *ailp = log->l_ailp;
+
+ bud_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format))
+ return -EFSCORRUPTED;
+ bui_id = bud_formatp->bud_bui_id;
+
+ /*
+ * Search for the BUI with the id in the BUD format structure in the
+ * AIL.
+ */
+ spin_lock(&ailp->xa_lock);
+ lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+ while (lip != NULL) {
+ if (lip->li_type == XFS_LI_BUI) {
+ buip = (struct xfs_bui_log_item *)lip;
+ if (buip->bui_format.bui_id == bui_id) {
+ /*
+ * Drop the BUD reference to the BUI. This
+ * removes the BUI from the AIL and frees it.
+ */
+ spin_unlock(&ailp->xa_lock);
+ xfs_bui_release(buip);
+ spin_lock(&ailp->xa_lock);
+ break;
+ }
+ }
+ lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ }
+
+ xfs_trans_ail_cursor_done(&cur);
+ spin_unlock(&ailp->xa_lock);
+
+ return 0;
+}
+
 /*
 * This routine is called when an inode create format structure is found in a
 * committed transaction in the log. Its purpose is to initialise the inodes
 * being allocated on disk. This requires us to get inode cluster buffers that
@@ -3773,6 +4019,10 @@ xlog_recover_ra_pass2(
 case XFS_LI_QUOTAOFF:
 case XFS_LI_RUI:
 case XFS_LI_RUD:
+ case XFS_LI_CUI:
+ case XFS_LI_CUD:
+ case XFS_LI_BUI:
+ case XFS_LI_BUD:
 default:
 break;
 }
@@ -3798,6 +4048,10 @@ xlog_recover_commit_pass1(
 case XFS_LI_ICREATE:
 case XFS_LI_RUI:
 case XFS_LI_RUD:
+ case XFS_LI_CUI:
+ case XFS_LI_CUD:
+ case XFS_LI_BUI:
+ case XFS_LI_BUD:
 /* nothing to do in pass 1 */
 return 0;
 default:
@@ -3832,6 +4086,14 @@ xlog_recover_commit_pass2(
 return xlog_recover_rui_pass2(log, item, trans->r_lsn);
 case XFS_LI_RUD:
 return xlog_recover_rud_pass2(log, item);
+ case XFS_LI_CUI:
+ return xlog_recover_cui_pass2(log, item, trans->r_lsn);
+ case XFS_LI_CUD:
+ return xlog_recover_cud_pass2(log, item);
+ case XFS_LI_BUI:
+ return xlog_recover_bui_pass2(log, item, trans->r_lsn);
+ case XFS_LI_BUD:
+ return xlog_recover_bud_pass2(log, item);
 case XFS_LI_DQUOT:
 return xlog_recover_dquot_pass2(log, buffer_list, item,
 trans->r_lsn);
@@ -4419,12 +4681,94 @@ xlog_recover_cancel_rui(
 spin_lock(&ailp->xa_lock);
 }
 
+/* Recover the CUI if necessary. */
+STATIC int
+xlog_recover_process_cui(
+ struct xfs_mount *mp,
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_cui_log_item *cuip;
+ int error;
+
+ /*
+ * Skip CUIs that we've already processed.
+ */
+ cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
+ if (test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags))
+ return 0;
+
+ spin_unlock(&ailp->xa_lock);
+ error = xfs_cui_recover(mp, cuip);
+ spin_lock(&ailp->xa_lock);
+
+ return error;
+}
+
+/* Release the CUI since we're cancelling everything. */
+STATIC void
+xlog_recover_cancel_cui(
+ struct xfs_mount *mp,
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_cui_log_item *cuip;
+
+ cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
+
+ spin_unlock(&ailp->xa_lock);
+ xfs_cui_release(cuip);
+ spin_lock(&ailp->xa_lock);
+}
+
+/* Recover the BUI if necessary. */
+STATIC int
+xlog_recover_process_bui(
+ struct xfs_mount *mp,
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_bui_log_item *buip;
+ int error;
+
+ /*
+ * Skip BUIs that we've already processed.
+ */
+ buip = container_of(lip, struct xfs_bui_log_item, bui_item);
+ if (test_bit(XFS_BUI_RECOVERED, &buip->bui_flags))
+ return 0;
+
+ spin_unlock(&ailp->xa_lock);
+ error = xfs_bui_recover(mp, buip);
+ spin_lock(&ailp->xa_lock);
+
+ return error;
+}
+
+/* Release the BUI since we're cancelling everything.
*/ +STATIC void +xlog_recover_cancel_bui( + struct xfs_mount *mp, + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_bui_log_item *buip; + + buip = container_of(lip, struct xfs_bui_log_item, bui_item); + + spin_unlock(&ailp->xa_lock); + xfs_bui_release(buip); + spin_lock(&ailp->xa_lock); +} + /* Is this log item a deferred action intent? */ static inline bool xlog_item_is_intent(struct xfs_log_item *lip) { switch (lip->li_type) { case XFS_LI_EFI: case XFS_LI_RUI: + case XFS_LI_CUI: + case XFS_LI_BUI: return true; default: return false; @@ -4488,6 +4832,12 @@ xlog_recover_process_intents( case XFS_LI_RUI: error = xlog_recover_process_rui(log->l_mp, ailp, lip); break; + case XFS_LI_CUI: + error = xlog_recover_process_cui(log->l_mp, ailp, lip); + break; + case XFS_LI_BUI: + error = xlog_recover_process_bui(log->l_mp, ailp, lip); + break; } if (error) goto out; @@ -4535,6 +4885,12 @@ xlog_recover_cancel_intents( case XFS_LI_RUI: xlog_recover_cancel_rui(log->l_mp, ailp, lip); break; + case XFS_LI_CUI: + xlog_recover_cancel_cui(log->l_mp, ailp, lip); + break; + case XFS_LI_BUI: + xlog_recover_cancel_bui(log->l_mp, ailp, lip); + break; } lip = xfs_trans_ail_cursor_next(ailp, &cur); @@ -4613,6 +4969,7 @@ xlog_recover_process_one_iunlink( if (error) goto fail_iput; + xfs_iflags_clear(ip, XFS_IRECOVERY); ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(VFS_I(ip)->i_mode != 0); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 56e85a6c85c7..fc7873942bea 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -43,6 +43,8 @@ #include "xfs_icache.h" #include "xfs_sysfs.h" #include "xfs_rmap_btree.h" +#include "xfs_refcount_btree.h" +#include "xfs_reflink.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); @@ -684,6 +686,7 @@ xfs_mountfs( xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); xfs_ialloc_compute_maxlevels(mp); xfs_rmapbt_compute_maxlevels(mp); + xfs_refcountbt_compute_maxlevels(mp); xfs_set_maxicount(mp); @@ -923,6 +926,15 @@ xfs_mountfs( } /* + * During the second phase of log recovery, we need iget and + * iput to behave like they do for an active filesystem. + * xfs_fs_drop_inode needs to be able to prevent the deletion + * of inodes before we're done replaying log items on those + * inodes. + */ + mp->m_super->s_flags |= MS_ACTIVE; + + /* * Finish recovering the file system. This part needed to be delayed * until after the root and real-time bitmap inodes were consistently * read in. @@ -974,10 +986,28 @@ xfs_mountfs( if (error) xfs_warn(mp, "Unable to allocate reserve blocks. Continuing without reserve pool."); + + /* Recover any CoW blocks that never got remapped. */ + error = xfs_reflink_recover_cow(mp); + if (error) { + xfs_err(mp, + "Error %d recovering leftover CoW allocations.", error); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + goto out_quota; + } + + /* Reserve AG blocks for future btree expansion. 
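+ * (Editor's note: as with xfs_fs_reserve_ag_blocks() above, -ENOSPC
+ * is tolerated so a full filesystem still mounts, just without the
+ * per-AG metadata reserve.)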
*/ + error = xfs_fs_reserve_ag_blocks(mp); + if (error && error != -ENOSPC) + goto out_agresv; } return 0; + out_agresv: + xfs_fs_unreserve_ag_blocks(mp); + out_quota: + xfs_qm_unmount_quotas(mp); out_rtunmount: xfs_rtunmount_inodes(mp); out_rele_rip: @@ -1019,7 +1049,9 @@ xfs_unmountfs( int error; cancel_delayed_work_sync(&mp->m_eofblocks_work); + cancel_delayed_work_sync(&mp->m_cowblocks_work); + xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); xfs_rtunmount_inodes(mp); IRELE(mp->m_rootip); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 041d9493e798..819b80b15bfb 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -124,10 +124,13 @@ typedef struct xfs_mount { uint m_inobt_mnr[2]; /* min inobt btree records */ uint m_rmap_mxr[2]; /* max rmap btree records */ uint m_rmap_mnr[2]; /* min rmap btree records */ + uint m_refc_mxr[2]; /* max refc btree records */ + uint m_refc_mnr[2]; /* min refc btree records */ uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ uint m_in_maxlevels; /* max inobt btree levels. */ uint m_rmap_maxlevels; /* max rmap btree levels */ + uint m_refc_maxlevels; /* max refcount btree level */ xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */ uint m_alloc_set_aside; /* space we can't use */ uint m_ag_max_usable; /* max space per AG */ @@ -161,6 +164,8 @@ typedef struct xfs_mount { struct delayed_work m_reclaim_work; /* background inode reclaim */ struct delayed_work m_eofblocks_work; /* background eof blocks trimming */ + struct delayed_work m_cowblocks_work; /* background cow blocks + trimming */ bool m_update_sb; /* sb needs update in mount */ int64_t m_low_space[XFS_LOWSP_MAX]; /* low free space thresholds */ @@ -399,6 +404,9 @@ typedef struct xfs_perag { struct xfs_ag_resv pag_meta_resv; /* Blocks reserved for just AGFL-based metadata. */ struct xfs_ag_resv pag_agfl_resv; + + /* reference count */ + __uint8_t pagf_refcount_level; } xfs_perag_t; static inline struct xfs_ag_resv * diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 69e2986a3776..0c381d71b242 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -49,6 +49,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4); XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_key, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_rec, 12); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24); XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8); @@ -56,6 +58,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t, 4); + XFS_CHECK_STRUCT_SIZE(xfs_refcount_ptr_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_rmap_ptr_t, 4); /* dir/attr trees */ diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 0f14b2e4bf6c..93a7aafa56d6 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -114,6 +114,13 @@ xfs_fs_map_blocks( return -ENXIO; /* + * The pNFS block layout spec actually supports reflink like + * functionality, but the Linux pNFS server doesn't implement it yet. + */ + if (xfs_is_reflink_inode(ip)) + return -ENXIO; + + /* * Lock out any other I/O before we flush and invalidate the pagecache, * and then hand out a layout to the remote system. 
This is very * similar to direct I/O, except that the synchronization is much more diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c new file mode 100644 index 000000000000..fe86a668a57e --- /dev/null +++ b/fs/xfs/xfs_refcount_item.c @@ -0,0 +1,539 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_buf_item.h" +#include "xfs_refcount_item.h" +#include "xfs_log.h" +#include "xfs_refcount.h" + + +kmem_zone_t *xfs_cui_zone; +kmem_zone_t *xfs_cud_zone; + +static inline struct xfs_cui_log_item *CUI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_cui_log_item, cui_item); +} + +void +xfs_cui_item_free( + struct xfs_cui_log_item *cuip) +{ + if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS) + kmem_free(cuip); + else + kmem_zone_free(xfs_cui_zone, cuip); +} + +STATIC void +xfs_cui_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + struct xfs_cui_log_item *cuip = CUI_ITEM(lip); + + *nvecs += 1; + *nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given cui log item. We use only 1 iovec, and we point that + * at the cui_log_format structure embedded in the cui item. + * It is at this point that we assert that all of the extent + * slots in the cui item have been filled. + */ +STATIC void +xfs_cui_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_cui_log_item *cuip = CUI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + ASSERT(atomic_read(&cuip->cui_next_extent) == + cuip->cui_format.cui_nextents); + + cuip->cui_format.cui_type = XFS_LI_CUI; + cuip->cui_format.cui_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUI_FORMAT, &cuip->cui_format, + xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents)); +} + +/* + * Pinning has no meaning for an cui item, so just return. + */ +STATIC void +xfs_cui_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * The unpin operation is the last place an CUI is manipulated in the log. It is + * either inserted in the AIL or aborted in the event of a log I/O error. In + * either case, the CUI transaction has been successfully committed to make it + * this far. Therefore, we expect whoever committed the CUI to either construct + * and commit the CUD or drop the CUD's reference in the event of error. Simply + * drop the log's CUI reference now that the log is done with it. 
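+ *
+ * Put differently: a CUI is created with two references (see
+ * xfs_cui_init()), one for the log, dropped here at unpin, and one
+ * for the CUD, dropped when the CUD commits or aborts.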
+ */ +STATIC void +xfs_cui_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_cui_log_item *cuip = CUI_ITEM(lip); + + xfs_cui_release(cuip); +} + +/* + * CUI items have no locking or pushing. However, since CUIs are pulled from + * the AIL when their corresponding CUDs are committed to disk, their situation + * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller + * will eventually flush the log. This should help in getting the CUI out of + * the AIL. + */ +STATIC uint +xfs_cui_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +/* + * The CUI has been either committed or aborted if the transaction has been + * cancelled. If the transaction was cancelled, an CUD isn't going to be + * constructed and thus we free the CUI here directly. + */ +STATIC void +xfs_cui_item_unlock( + struct xfs_log_item *lip) +{ + if (lip->li_flags & XFS_LI_ABORTED) + xfs_cui_item_free(CUI_ITEM(lip)); +} + +/* + * The CUI is logged only once and cannot be moved in the log, so simply return + * the lsn at which it's been logged. + */ +STATIC xfs_lsn_t +xfs_cui_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + return lsn; +} + +/* + * The CUI dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_cui_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all cui log items. + */ +static const struct xfs_item_ops xfs_cui_item_ops = { + .iop_size = xfs_cui_item_size, + .iop_format = xfs_cui_item_format, + .iop_pin = xfs_cui_item_pin, + .iop_unpin = xfs_cui_item_unpin, + .iop_unlock = xfs_cui_item_unlock, + .iop_committed = xfs_cui_item_committed, + .iop_push = xfs_cui_item_push, + .iop_committing = xfs_cui_item_committing, +}; + +/* + * Allocate and initialize an cui item with the given number of extents. + */ +struct xfs_cui_log_item * +xfs_cui_init( + struct xfs_mount *mp, + uint nextents) + +{ + struct xfs_cui_log_item *cuip; + + ASSERT(nextents > 0); + if (nextents > XFS_CUI_MAX_FAST_EXTENTS) + cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), + KM_SLEEP); + else + cuip = kmem_zone_zalloc(xfs_cui_zone, KM_SLEEP); + + xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); + cuip->cui_format.cui_nextents = nextents; + cuip->cui_format.cui_id = (uintptr_t)(void *)cuip; + atomic_set(&cuip->cui_next_extent, 0); + atomic_set(&cuip->cui_refcount, 2); + + return cuip; +} + +/* + * Freeing the CUI requires that we remove it from the AIL if it has already + * been placed there. However, the CUI may not yet have been placed in the AIL + * when called by xfs_cui_release() from CUD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the CUI. 
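+ *
+ * Both the unpin path and the CUD retirement path funnel through
+ * here; whichever drops the final reference removes the item from
+ * the AIL and frees it.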
+ */ +void +xfs_cui_release( + struct xfs_cui_log_item *cuip) +{ + if (atomic_dec_and_test(&cuip->cui_refcount)) { + xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_cui_item_free(cuip); + } +} + +static inline struct xfs_cud_log_item *CUD_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_cud_log_item, cud_item); +} + +STATIC void +xfs_cud_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_cud_log_format); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given cud log item. We use only 1 iovec, and we point that + * at the cud_log_format structure embedded in the cud item. + * It is at this point that we assert that all of the extent + * slots in the cud item have been filled. + */ +STATIC void +xfs_cud_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_cud_log_item *cudp = CUD_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + cudp->cud_format.cud_type = XFS_LI_CUD; + cudp->cud_format.cud_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUD_FORMAT, &cudp->cud_format, + sizeof(struct xfs_cud_log_format)); +} + +/* + * Pinning has no meaning for an cud item, so just return. + */ +STATIC void +xfs_cud_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * Since pinning has no meaning for an cud item, unpinning does + * not either. + */ +STATIC void +xfs_cud_item_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +/* + * There isn't much you can do to push on an cud item. It is simply stuck + * waiting for the log to be flushed to disk. + */ +STATIC uint +xfs_cud_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +/* + * The CUD is either committed or aborted if the transaction is cancelled. If + * the transaction is cancelled, drop our reference to the CUI and free the + * CUD. + */ +STATIC void +xfs_cud_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_cud_log_item *cudp = CUD_ITEM(lip); + + if (lip->li_flags & XFS_LI_ABORTED) { + xfs_cui_release(cudp->cud_cuip); + kmem_zone_free(xfs_cud_zone, cudp); + } +} + +/* + * When the cud item is committed to disk, all we need to do is delete our + * reference to our partner cui item and then free ourselves. Since we're + * freeing ourselves we must return -1 to keep the transaction code from + * further referencing this item. + */ +STATIC xfs_lsn_t +xfs_cud_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_cud_log_item *cudp = CUD_ITEM(lip); + + /* + * Drop the CUI reference regardless of whether the CUD has been + * aborted. Once the CUD transaction is constructed, it is the sole + * responsibility of the CUD to release the CUI (even if the CUI is + * aborted due to log I/O error). + */ + xfs_cui_release(cudp->cud_cuip); + kmem_zone_free(xfs_cud_zone, cudp); + + return (xfs_lsn_t)-1; +} + +/* + * The CUD dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_cud_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all cud log items. 
+ */ +static const struct xfs_item_ops xfs_cud_item_ops = { + .iop_size = xfs_cud_item_size, + .iop_format = xfs_cud_item_format, + .iop_pin = xfs_cud_item_pin, + .iop_unpin = xfs_cud_item_unpin, + .iop_unlock = xfs_cud_item_unlock, + .iop_committed = xfs_cud_item_committed, + .iop_push = xfs_cud_item_push, + .iop_committing = xfs_cud_item_committing, +}; + +/* + * Allocate and initialize an cud item with the given number of extents. + */ +struct xfs_cud_log_item * +xfs_cud_init( + struct xfs_mount *mp, + struct xfs_cui_log_item *cuip) + +{ + struct xfs_cud_log_item *cudp; + + cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP); + xfs_log_item_init(mp, &cudp->cud_item, XFS_LI_CUD, &xfs_cud_item_ops); + cudp->cud_cuip = cuip; + cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; + + return cudp; +} + +/* + * Process a refcount update intent item that was recovered from the log. + * We need to update the refcountbt. + */ +int +xfs_cui_recover( + struct xfs_mount *mp, + struct xfs_cui_log_item *cuip) +{ + int i; + int error = 0; + unsigned int refc_type; + struct xfs_phys_extent *refc; + xfs_fsblock_t startblock_fsb; + bool op_ok; + struct xfs_cud_log_item *cudp; + struct xfs_trans *tp; + struct xfs_btree_cur *rcur = NULL; + enum xfs_refcount_intent_type type; + xfs_fsblock_t firstfsb; + xfs_fsblock_t new_fsb; + xfs_extlen_t new_len; + struct xfs_bmbt_irec irec; + struct xfs_defer_ops dfops; + bool requeue_only = false; + + ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)); + + /* + * First check the validity of the extents described by the + * CUI. If any are bad, then assume that all are bad and + * just toss the CUI. + */ + for (i = 0; i < cuip->cui_format.cui_nextents; i++) { + refc = &cuip->cui_format.cui_extents[i]; + startblock_fsb = XFS_BB_TO_FSB(mp, + XFS_FSB_TO_DADDR(mp, refc->pe_startblock)); + switch (refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) { + case XFS_REFCOUNT_INCREASE: + case XFS_REFCOUNT_DECREASE: + case XFS_REFCOUNT_ALLOC_COW: + case XFS_REFCOUNT_FREE_COW: + op_ok = true; + break; + default: + op_ok = false; + break; + } + if (!op_ok || startblock_fsb == 0 || + refc->pe_len == 0 || + startblock_fsb >= mp->m_sb.sb_dblocks || + refc->pe_len >= mp->m_sb.sb_agblocks || + (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)) { + /* + * This will pull the CUI from the AIL and + * free the memory associated with it. + */ + set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); + xfs_cui_release(cuip); + return -EIO; + } + } + + /* + * Under normal operation, refcount updates are deferred, so we + * wouldn't be adding them directly to a transaction. All + * refcount updates manage reservation usage internally and + * dynamically by deferring work that won't fit in the + * transaction. Normally, any work that needs to be deferred + * gets attached to the same defer_ops that scheduled the + * refcount update. However, we're in log recovery here, so we + * we create our own defer_ops and use that to finish up any + * work that doesn't fit. 
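+ *
+ * Roughly, the code below does:
+ *
+ *	allocate a transaction and log a CUD against this CUI;
+ *	for each extent in the CUI:
+ *		finish the refcount update, with any unfinished
+ *		remainder returned in (new_fsb, new_len);
+ *		if anything remains, requeue it as a fresh deferred
+ *		intent and stop updating the btree directly;
+ *	finish the deferred work, mark the CUI recovered, and commit.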
+ */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + if (error) + return error; + cudp = xfs_trans_get_cud(tp, cuip); + + xfs_defer_init(&dfops, &firstfsb); + for (i = 0; i < cuip->cui_format.cui_nextents; i++) { + refc = &cuip->cui_format.cui_extents[i]; + refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; + switch (refc_type) { + case XFS_REFCOUNT_INCREASE: + case XFS_REFCOUNT_DECREASE: + case XFS_REFCOUNT_ALLOC_COW: + case XFS_REFCOUNT_FREE_COW: + type = refc_type; + break; + default: + error = -EFSCORRUPTED; + goto abort_error; + } + if (requeue_only) { + new_fsb = refc->pe_startblock; + new_len = refc->pe_len; + } else + error = xfs_trans_log_finish_refcount_update(tp, cudp, + &dfops, type, refc->pe_startblock, refc->pe_len, + &new_fsb, &new_len, &rcur); + if (error) + goto abort_error; + + /* Requeue what we didn't finish. */ + if (new_len > 0) { + irec.br_startblock = new_fsb; + irec.br_blockcount = new_len; + switch (type) { + case XFS_REFCOUNT_INCREASE: + error = xfs_refcount_increase_extent( + tp->t_mountp, &dfops, &irec); + break; + case XFS_REFCOUNT_DECREASE: + error = xfs_refcount_decrease_extent( + tp->t_mountp, &dfops, &irec); + break; + case XFS_REFCOUNT_ALLOC_COW: + error = xfs_refcount_alloc_cow_extent( + tp->t_mountp, &dfops, + irec.br_startblock, + irec.br_blockcount); + break; + case XFS_REFCOUNT_FREE_COW: + error = xfs_refcount_free_cow_extent( + tp->t_mountp, &dfops, + irec.br_startblock, + irec.br_blockcount); + break; + default: + ASSERT(0); + } + if (error) + goto abort_error; + requeue_only = true; + } + } + + xfs_refcount_finish_one_cleanup(tp, rcur, error); + error = xfs_defer_finish(&tp, &dfops, NULL); + if (error) + goto abort_error; + set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); + error = xfs_trans_commit(tp); + return error; + +abort_error: + xfs_refcount_finish_one_cleanup(tp, rcur, error); + xfs_defer_cancel(&dfops); + xfs_trans_cancel(tp); + return error; +} diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h new file mode 100644 index 000000000000..5b74dddfa64b --- /dev/null +++ b/fs/xfs/xfs_refcount_item.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_REFCOUNT_ITEM_H__ +#define __XFS_REFCOUNT_ITEM_H__ + +/* + * There are (currently) two pairs of refcount btree redo item types: + * increase and decrease. The log items for these are CUI (refcount + * update intent) and CUD (refcount update done). The redo item type + * is encoded in the flags field of each xfs_map_extent. + * + * *I items should be recorded in the *first* of a series of rolled + * transactions, and the *D items should be recorded in the same + * transaction that records the associated refcountbt updates. 
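+ *
+ * For example, sharing an extent might log, across two transactions:
+ *
+ *	t1: CUI (intent: increase the refcount of extent X)
+ *	t2: refcount btree changes for X, plus a CUD naming t1's CUI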
+ * + * Should the system crash after the commit of the first transaction + * but before the commit of the final transaction in a series, log + * recovery will use the redo information recorded by the intent items + * to replay the refcountbt metadata updates. + */ + +/* kernel only CUI/CUD definitions */ + +struct xfs_mount; +struct kmem_zone; + +/* + * Max number of extents in fast allocation path. + */ +#define XFS_CUI_MAX_FAST_EXTENTS 16 + +/* + * Define CUI flag bits. Manipulated by set/clear/test_bit operators. + */ +#define XFS_CUI_RECOVERED 1 + +/* + * This is the "refcount update intent" log item. It is used to log + * the fact that some reverse mappings need to change. It is used in + * conjunction with the "refcount update done" log item described + * below. + * + * These log items follow the same rules as struct xfs_efi_log_item; + * see the comments about that structure (in xfs_extfree_item.h) for + * more details. + */ +struct xfs_cui_log_item { + struct xfs_log_item cui_item; + atomic_t cui_refcount; + atomic_t cui_next_extent; + unsigned long cui_flags; /* misc flags */ + struct xfs_cui_log_format cui_format; +}; + +static inline size_t +xfs_cui_log_item_sizeof( + unsigned int nr) +{ + return offsetof(struct xfs_cui_log_item, cui_format) + + xfs_cui_log_format_sizeof(nr); +} + +/* + * This is the "refcount update done" log item. It is used to log the + * fact that some refcountbt updates mentioned in an earlier cui item + * have been performed. + */ +struct xfs_cud_log_item { + struct xfs_log_item cud_item; + struct xfs_cui_log_item *cud_cuip; + struct xfs_cud_log_format cud_format; +}; + +extern struct kmem_zone *xfs_cui_zone; +extern struct kmem_zone *xfs_cud_zone; + +struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint); +struct xfs_cud_log_item *xfs_cud_init(struct xfs_mount *, + struct xfs_cui_log_item *); +void xfs_cui_item_free(struct xfs_cui_log_item *); +void xfs_cui_release(struct xfs_cui_log_item *); +int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip); + +#endif /* __XFS_REFCOUNT_ITEM_H__ */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c new file mode 100644 index 000000000000..5965e9455d91 --- /dev/null +++ b/fs/xfs/xfs_reflink.c @@ -0,0 +1,1688 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_error.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_ioctl.h" +#include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_icache.h" +#include "xfs_pnfs.h" +#include "xfs_btree.h" +#include "xfs_refcount_btree.h" +#include "xfs_refcount.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_bit.h" +#include "xfs_alloc.h" +#include "xfs_quota_defs.h" +#include "xfs_quota.h" +#include "xfs_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_reflink.h" +#include "xfs_iomap.h" +#include "xfs_rmap_btree.h" +#include "xfs_sb.h" +#include "xfs_ag_resv.h" + +/* + * Copy on Write of Shared Blocks + * + * XFS must preserve "the usual" file semantics even when two files share + * the same physical blocks. This means that a write to one file must not + * alter the blocks in a different file; the way that we'll do that is + * through the use of a copy-on-write mechanism. At a high level, that + * means that when we want to write to a shared block, we allocate a new + * block, write the data to the new block, and if that succeeds we map the + * new block into the file. + * + * XFS provides a "delayed allocation" mechanism that defers the allocation + * of disk blocks to dirty-but-not-yet-mapped file blocks as long as + * possible. This reduces fragmentation by enabling the filesystem to ask + * for bigger chunks less often, which is exactly what we want for CoW. + * + * The delalloc mechanism begins when the kernel wants to make a block + * writable (write_begin or page_mkwrite). If the offset is not mapped, we + * create a delalloc mapping, which is a regular in-core extent, but without + * a real startblock. (For delalloc mappings, the startblock encodes both + * a flag that this is a delalloc mapping, and a worst-case estimate of how + * many blocks might be required to put the mapping into the BMBT.) delalloc + * mappings are a reservation against the free space in the filesystem; + * adjacent mappings can also be combined into fewer larger mappings. + * + * When dirty pages are being written out (typically in writepage), the + * delalloc reservations are converted into real mappings by allocating + * blocks and replacing the delalloc mapping with real ones. A delalloc + * mapping can be replaced by several real ones if the free space is + * fragmented. + * + * We want to adapt the delalloc mechanism for copy-on-write, since the + * write paths are similar. The first two steps (creating the reservation + * and allocating the blocks) are exactly the same as delalloc except that + * the mappings must be stored in a separate CoW fork because we do not want + * to disturb the mapping in the data fork until we're sure that the write + * succeeded. IO completion in this case is the process of removing the old + * mapping from the data fork and moving the new mapping from the CoW fork to + * the data fork. This will be discussed shortly. + * + * For now, unaligned directio writes will be bounced back to the page cache. + * Block-aligned directio writes will use the same mechanism as buffered + * writes. 
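+ *
+ * In short, a CoW write of a shared block proceeds roughly as:
+ *
+ *	write_begin/page_mkwrite: reserve a delalloc extent in the
+ *		CoW fork;
+ *	writeback: allocate real blocks for that extent;
+ *	I/O completion: punch the old mapping out of the data fork
+ *		and move the new mapping over from the CoW fork.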
+ * + * CoW remapping must be done after the data block write completes, + * because we don't want to destroy the old data fork map until we're sure + * the new block has been written. Since the new mappings are kept in a + * separate fork, we can simply iterate these mappings to find the ones + * that cover the file blocks that we just CoW'd. For each extent, simply + * unmap the corresponding range in the data fork, map the new range into + * the data fork, and remove the extent from the CoW fork. + * + * Since the remapping operation can be applied to an arbitrary file + * range, we record the need for the remap step as a flag in the ioend + * instead of declaring a new IO type. This is required for direct io + * because we only have ioend for the whole dio, and we have to be able to + * remember the presence of unwritten blocks and CoW blocks with a single + * ioend structure. Better yet, the more ground we can cover with one + * ioend, the better. + */ + +/* + * Given an AG extent, find the lowest-numbered run of shared blocks + * within that range and return the range in fbno/flen. If + * find_end_of_shared is true, return the longest contiguous extent of + * shared blocks. If there are no shared extents, fbno and flen will + * be set to NULLAGBLOCK and 0, respectively. + */ +int +xfs_reflink_find_shared( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + xfs_agblock_t *fbno, + xfs_extlen_t *flen, + bool find_end_of_shared) +{ + struct xfs_buf *agbp; + struct xfs_btree_cur *cur; + int error; + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error) + return error; + + cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); + + error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, + find_end_of_shared); + + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + + xfs_buf_relse(agbp); + return error; +} + +/* + * Trim the mapping to the next block where there's a change in the + * shared/unshared status. More specifically, this means that we + * find the lowest-numbered extent of shared blocks that coincides with + * the given block mapping. If the shared extent overlaps the start of + * the mapping, trim the mapping to the end of the shared extent. If + * the shared region intersects the mapping, trim the mapping to the + * start of the shared extent. If there are no shared regions that + * overlap, just return the original extent. + */ +int +xfs_reflink_trim_around_shared( + struct xfs_inode *ip, + struct xfs_bmbt_irec *irec, + bool *shared, + bool *trimmed) +{ + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_extlen_t aglen; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int error = 0; + + /* Holes, unwritten, and delalloc extents cannot be shared */ + if (!xfs_is_reflink_inode(ip) || + ISUNWRITTEN(irec) || + irec->br_startblock == HOLESTARTBLOCK || + irec->br_startblock == DELAYSTARTBLOCK) { + *shared = false; + return 0; + } + + trace_xfs_reflink_trim_around_shared(ip, irec); + + agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock); + agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock); + aglen = irec->br_blockcount; + + error = xfs_reflink_find_shared(ip->i_mount, agno, agbno, + aglen, &fbno, &flen, true); + if (error) + return error; + + *shared = *trimmed = false; + if (fbno == NULLAGBLOCK) { + /* No shared blocks at all. */ + return 0; + } else if (fbno == agbno) { + /* + * The start of this extent is shared. 
Truncate the + * mapping at the end of the shared region so that a + * subsequent iteration starts at the start of the + * unshared region. + */ + irec->br_blockcount = flen; + *shared = true; + if (flen != aglen) + *trimmed = true; + return 0; + } else { + /* + * There's a shared extent midway through this extent. + * Truncate the mapping at the start of the shared + * extent so that a subsequent iteration starts at the + * start of the shared region. + */ + irec->br_blockcount = fbno - agbno; + *trimmed = true; + return 0; + } +} + +/* Create a CoW reservation for a range of blocks within a file. */ +static int +__xfs_reflink_reserve_cow( + struct xfs_inode *ip, + xfs_fileoff_t *offset_fsb, + xfs_fileoff_t end_fsb, + bool *skipped) +{ + struct xfs_bmbt_irec got, prev, imap; + xfs_fileoff_t orig_end_fsb; + int nimaps, eof = 0, error = 0; + bool shared = false, trimmed = false; + xfs_extnum_t idx; + xfs_extlen_t align; + + /* Already reserved? Skip the refcount btree access. */ + xfs_bmap_search_extents(ip, *offset_fsb, XFS_COW_FORK, &eof, &idx, + &got, &prev); + if (!eof && got.br_startoff <= *offset_fsb) { + end_fsb = orig_end_fsb = got.br_startoff + got.br_blockcount; + trace_xfs_reflink_cow_found(ip, &got); + goto done; + } + + /* Read extent from the source file. */ + nimaps = 1; + error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, + &imap, &nimaps, 0); + if (error) + goto out_unlock; + ASSERT(nimaps == 1); + + /* Trim the mapping to the nearest shared extent boundary. */ + error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); + if (error) + goto out_unlock; + + end_fsb = orig_end_fsb = imap.br_startoff + imap.br_blockcount; + + /* Not shared? Just report the (potentially capped) extent. */ + if (!shared) { + *skipped = true; + goto done; + } + + /* + * Fork all the shared blocks from our write offset until the end of + * the extent. + */ + error = xfs_qm_dqattach_locked(ip, 0); + if (error) + goto out_unlock; + + align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip)); + if (align) + end_fsb = roundup_64(end_fsb, align); + +retry: + error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, *offset_fsb, + end_fsb - *offset_fsb, &got, + &prev, &idx, eof); + switch (error) { + case 0: + break; + case -ENOSPC: + case -EDQUOT: + /* retry without any preallocation */ + trace_xfs_reflink_cow_enospc(ip, &imap); + if (end_fsb != orig_end_fsb) { + end_fsb = orig_end_fsb; + goto retry; + } + /*FALLTHRU*/ + default: + goto out_unlock; + } + + if (end_fsb != orig_end_fsb) + xfs_inode_set_cowblocks_tag(ip); + + trace_xfs_reflink_cow_alloc(ip, &got); +done: + *offset_fsb = end_fsb; +out_unlock: + return error; +} + +/* Create a CoW reservation for part of a file. */ +int +xfs_reflink_reserve_cow_range( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb, end_fsb; + bool skipped = false; + int error; + + trace_xfs_reflink_reserve_cow_range(ip, offset, count); + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + end_fsb = XFS_B_TO_FSB(mp, offset + count); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + while (offset_fsb < end_fsb) { + error = __xfs_reflink_reserve_cow(ip, &offset_fsb, end_fsb, + &skipped); + if (error) { + trace_xfs_reflink_reserve_cow_range_error(ip, error, + _RET_IP_); + break; + } + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + return error; +} + +/* Allocate all CoW reservations covering a range of blocks in a file. 
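+ * Each call converts at most one reservation to real blocks and
+ * advances *offset_fsb past whatever it mapped; callers are expected
+ * to loop until the whole range is done.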
*/ +static int +__xfs_reflink_allocate_cow( + struct xfs_inode *ip, + xfs_fileoff_t *offset_fsb, + xfs_fileoff_t end_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + struct xfs_defer_ops dfops; + struct xfs_trans *tp; + xfs_fsblock_t first_block; + xfs_fileoff_t next_fsb; + int nimaps = 1, error; + bool skipped = false; + + xfs_defer_init(&dfops, &first_block); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, + XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + next_fsb = *offset_fsb; + error = __xfs_reflink_reserve_cow(ip, &next_fsb, end_fsb, &skipped); + if (error) + goto out_trans_cancel; + + if (skipped) { + *offset_fsb = next_fsb; + goto out_trans_cancel; + } + + xfs_trans_ijoin(tp, ip, 0); + error = xfs_bmapi_write(tp, ip, *offset_fsb, next_fsb - *offset_fsb, + XFS_BMAPI_COWFORK, &first_block, + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), + &imap, &nimaps, &dfops); + if (error) + goto out_trans_cancel; + + /* We might not have been able to map the whole delalloc extent */ + *offset_fsb = min(*offset_fsb + imap.br_blockcount, next_fsb); + + error = xfs_defer_finish(&tp, &dfops, NULL); + if (error) + goto out_trans_cancel; + + error = xfs_trans_commit(tp); + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +out_trans_cancel: + xfs_defer_cancel(&dfops); + xfs_trans_cancel(tp); + goto out_unlock; +} + +/* Allocate all CoW reservations covering a part of a file. */ +int +xfs_reflink_allocate_cow_range( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); + int error; + + ASSERT(xfs_is_reflink_inode(ip)); + + trace_xfs_reflink_allocate_cow_range(ip, offset, count); + + /* + * Make sure that the dquots are there. + */ + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + while (offset_fsb < end_fsb) { + error = __xfs_reflink_allocate_cow(ip, &offset_fsb, end_fsb); + if (error) { + trace_xfs_reflink_allocate_cow_range_error(ip, error, + _RET_IP_); + break; + } + } + + return error; +} + +/* + * Find the CoW reservation (and whether or not it needs block allocation) + * for a given byte offset of a file. + */ +bool +xfs_reflink_find_cow_mapping( + struct xfs_inode *ip, + xfs_off_t offset, + struct xfs_bmbt_irec *imap, + bool *need_alloc) +{ + struct xfs_bmbt_irec irec; + struct xfs_ifork *ifp; + struct xfs_bmbt_rec_host *gotp; + xfs_fileoff_t bno; + xfs_extnum_t idx; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); + ASSERT(xfs_is_reflink_inode(ip)); + + /* Find the extent in the CoW fork. */ + ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + bno = XFS_B_TO_FSBT(ip->i_mount, offset); + gotp = xfs_iext_bno_to_ext(ifp, bno, &idx); + if (!gotp) + return false; + + xfs_bmbt_get_all(gotp, &irec); + if (bno >= irec.br_startoff + irec.br_blockcount || + bno < irec.br_startoff) + return false; + + trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE, + &irec); + + /* If it's still delalloc, we must allocate later. */ + *imap = irec; + *need_alloc = !!(isnullstartblock(irec.br_startblock)); + + return true; +} + +/* + * Trim an extent to end at the next CoW reservation past offset_fsb. 
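+ * That way the mapping handed back to the caller never extends past
+ * the start of the next CoW reservation. If there is no reservation
+ * past offset_fsb, the mapping is left untouched.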
+ */ +int +xfs_reflink_trim_irec_to_next_cow( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + struct xfs_bmbt_irec *imap) +{ + struct xfs_bmbt_irec irec; + struct xfs_ifork *ifp; + struct xfs_bmbt_rec_host *gotp; + xfs_extnum_t idx; + + if (!xfs_is_reflink_inode(ip)) + return 0; + + /* Find the extent in the CoW fork. */ + ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx); + if (!gotp) + return 0; + xfs_bmbt_get_all(gotp, &irec); + + /* This is the extent before; try sliding up one. */ + if (irec.br_startoff < offset_fsb) { + idx++; + if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + return 0; + gotp = xfs_iext_get_ext(ifp, idx); + xfs_bmbt_get_all(gotp, &irec); + } + + if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount) + return 0; + + imap->br_blockcount = irec.br_startoff - imap->br_startoff; + trace_xfs_reflink_trim_irec(ip, imap); + + return 0; +} + +/* + * Cancel all pending CoW reservations for some block range of an inode. + */ +int +xfs_reflink_cancel_cow_blocks( + struct xfs_inode *ip, + struct xfs_trans **tpp, + xfs_fileoff_t offset_fsb, + xfs_fileoff_t end_fsb) +{ + struct xfs_bmbt_irec irec; + xfs_filblks_t count_fsb; + xfs_fsblock_t firstfsb; + struct xfs_defer_ops dfops; + int error = 0; + int nimaps; + + if (!xfs_is_reflink_inode(ip)) + return 0; + + /* Go find the old extent in the CoW fork. */ + while (offset_fsb < end_fsb) { + nimaps = 1; + count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); + error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &irec, + &nimaps, XFS_BMAPI_COWFORK); + if (error) + break; + ASSERT(nimaps == 1); + + trace_xfs_reflink_cancel_cow(ip, &irec); + + if (irec.br_startblock == DELAYSTARTBLOCK) { + /* Free a delayed allocation. */ + xfs_mod_fdblocks(ip->i_mount, irec.br_blockcount, + false); + ip->i_delayed_blks -= irec.br_blockcount; + + /* Remove the mapping from the CoW fork. */ + error = xfs_bunmapi_cow(ip, &irec); + if (error) + break; + } else if (irec.br_startblock == HOLESTARTBLOCK) { + /* empty */ + } else { + xfs_trans_ijoin(*tpp, ip, 0); + xfs_defer_init(&dfops, &firstfsb); + + /* Free the CoW orphan record. */ + error = xfs_refcount_free_cow_extent(ip->i_mount, + &dfops, irec.br_startblock, + irec.br_blockcount); + if (error) + break; + + xfs_bmap_add_free(ip->i_mount, &dfops, + irec.br_startblock, irec.br_blockcount, + NULL); + + /* Update quota accounting */ + xfs_trans_mod_dquot_byino(*tpp, ip, XFS_TRANS_DQ_BCOUNT, + -(long)irec.br_blockcount); + + /* Roll the transaction */ + error = xfs_defer_finish(tpp, &dfops, ip); + if (error) { + xfs_defer_cancel(&dfops); + break; + } + + /* Remove the mapping from the CoW fork. */ + error = xfs_bunmapi_cow(ip, &irec); + if (error) + break; + } + + /* Roll on... */ + offset_fsb = irec.br_startoff + irec.br_blockcount; + } + + return error; +} + +/* + * Cancel all pending CoW reservations for some byte range of an inode. 
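+ *
+ * Unlike xfs_reflink_cancel_cow_blocks() above, this takes the inode
+ * lock and runs its own rolling transaction.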
+ */ +int +xfs_reflink_cancel_cow_range( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + struct xfs_trans *tp; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t end_fsb; + int error; + + trace_xfs_reflink_cancel_cow_range(ip, offset, count); + ASSERT(xfs_is_reflink_inode(ip)); + + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + if (count == NULLFILEOFF) + end_fsb = NULLFILEOFF; + else + end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); + + /* Start a rolling transaction to remove the mappings */ + error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, + 0, 0, 0, &tp); + if (error) + goto out; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* Scrape out the old CoW reservations */ + error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb); + if (error) + goto out_cancel; + + error = xfs_trans_commit(tp); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + +out_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out: + trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_); + return error; +} + +/* + * Remap parts of a file's data fork after a successful CoW. + */ +int +xfs_reflink_end_cow( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + struct xfs_bmbt_irec irec; + struct xfs_bmbt_irec uirec; + struct xfs_trans *tp; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t end_fsb; + xfs_filblks_t count_fsb; + xfs_fsblock_t firstfsb; + struct xfs_defer_ops dfops; + int error; + unsigned int resblks; + xfs_filblks_t ilen; + xfs_filblks_t rlen; + int nimaps; + + trace_xfs_reflink_end_cow(ip, offset, count); + + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); + count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); + + /* Start a rolling transaction to switch the mappings */ + resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, + resblks, 0, 0, &tp); + if (error) + goto out; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* Go find the old extent in the CoW fork. */ + while (offset_fsb < end_fsb) { + /* Read extent from the source file */ + nimaps = 1; + count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); + error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &irec, + &nimaps, XFS_BMAPI_COWFORK); + if (error) + goto out_cancel; + ASSERT(nimaps == 1); + + ASSERT(irec.br_startblock != DELAYSTARTBLOCK); + trace_xfs_reflink_cow_remap(ip, &irec); + + /* + * We can have a hole in the CoW fork if part of a directio + * write is CoW but part of it isn't. + */ + rlen = ilen = irec.br_blockcount; + if (irec.br_startblock == HOLESTARTBLOCK) + goto next_extent; + + /* Unmap the old blocks in the data fork. */ + while (rlen) { + xfs_defer_init(&dfops, &firstfsb); + error = __xfs_bunmapi(tp, ip, irec.br_startoff, + &rlen, 0, 1, &firstfsb, &dfops); + if (error) + goto out_defer; + + /* + * Trim the extent to whatever got unmapped. + * Remember, bunmapi works backwards. + */ + uirec.br_startblock = irec.br_startblock + rlen; + uirec.br_startoff = irec.br_startoff + rlen; + uirec.br_blockcount = irec.br_blockcount - rlen; + irec.br_blockcount = rlen; + trace_xfs_reflink_cow_remap_piece(ip, &uirec); + + /* Free the CoW orphan record. */ + error = xfs_refcount_free_cow_extent(tp->t_mountp, + &dfops, uirec.br_startblock, + uirec.br_blockcount); + if (error) + goto out_defer; + + /* Map the new blocks into the data fork. 
*/ + error = xfs_bmap_map_extent(tp->t_mountp, &dfops, + ip, &uirec); + if (error) + goto out_defer; + + /* Remove the mapping from the CoW fork. */ + error = xfs_bunmapi_cow(ip, &uirec); + if (error) + goto out_defer; + + error = xfs_defer_finish(&tp, &dfops, ip); + if (error) + goto out_defer; + } + +next_extent: + /* Roll on... */ + offset_fsb = irec.br_startoff + ilen; + } + + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + goto out; + return 0; + +out_defer: + xfs_defer_cancel(&dfops); +out_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out: + trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); + return error; +} + +/* + * Free leftover CoW reservations that didn't get cleaned out. + */ +int +xfs_reflink_recover_cow( + struct xfs_mount *mp) +{ + xfs_agnumber_t agno; + int error = 0; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + error = xfs_refcount_recover_cow_leftovers(mp, agno); + if (error) + break; + } + + return error; +} + +/* + * Reflinking (Block) Ranges of Two Files Together + * + * First, ensure that the reflink flag is set on both inodes. The flag is an + * optimization to avoid unnecessary refcount btree lookups in the write path. + * + * Now we can iteratively remap the range of extents (and holes) in src to the + * corresponding ranges in dest. Let drange and srange denote the ranges of + * logical blocks in dest and src touched by the reflink operation. + * + * While the length of drange is greater than zero, + * - Read src's bmbt at the start of srange ("imap") + * - If imap doesn't exist, make imap appear to start at the end of srange + * with zero length. + * - If imap starts before srange, advance imap to start at srange. + * - If imap goes beyond srange, truncate imap to end at the end of srange. + * - Punch (imap start - srange start + imap len) blocks from dest at + * offset (drange start). + * - If imap points to a real range of pblks, + * > Increase the refcount of the imap's pblks + * > Map imap's pblks into dest at the offset + * (drange start + imap start - srange start) + * - Advance drange and srange by (imap start - srange start + imap len) + * + * Finally, if the reflink made dest longer, update both the in-core and + * on-disk file sizes. + * + * ASCII Art Demonstration: + * + * Let's say we want to reflink this source file: + * + * ----SSSSSSS-SSSSS----SSSSSS (src file) + * <--------------------> + * + * into this destination file: + * + * --DDDDDDDDDDDDDDDDDDD--DDD (dest file) + * <--------------------> + * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest. + * Observe that the range has different logical offsets in either file. + * + * Consider that the first extent in the source file doesn't line up with our + * reflink range. Unmapping and remapping are separate operations, so we can + * unmap more blocks from the destination file than we remap. + * + * ----SSSSSSS-SSSSS----SSSSSS + * <-------> + * --DDDDD---------DDDDD--DDD + * <-------> + * + * Now remap the source extent into the destination file: + * + * ----SSSSSSS-SSSSS----SSSSSS + * <-------> + * --DDDDD--SSSSSSSDDDDD--DDD + * <-------> + * + * Do likewise with the second hole and extent in our range. Holes in the + * unmap range don't affect our operation. + * + * ----SSSSSSS-SSSSS----SSSSSS + * <----> + * --DDDDD--SSSSSSS-SSSSS-DDD + * <----> + * + * Finally, unmap and remap part of the third extent. 
This will increase the + * size of the destination file. + * + * ----SSSSSSS-SSSSS----SSSSSS + * <-----> + * --DDDDD--SSSSSSS-SSSSS----SSS + * <-----> + * + * Once we update the destination file's i_size, we're done. + */ + +/* + * Ensure the reflink bit is set in both inodes. + */ +STATIC int +xfs_reflink_set_inode_flag( + struct xfs_inode *src, + struct xfs_inode *dest) +{ + struct xfs_mount *mp = src->i_mount; + int error; + struct xfs_trans *tp; + + if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest)) + return 0; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) + goto out_error; + + /* Lock both files against IO */ + if (src->i_ino == dest->i_ino) + xfs_ilock(src, XFS_ILOCK_EXCL); + else + xfs_lock_two_inodes(src, dest, XFS_ILOCK_EXCL); + + if (!xfs_is_reflink_inode(src)) { + trace_xfs_reflink_set_inode_flag(src); + xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL); + src->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; + xfs_trans_log_inode(tp, src, XFS_ILOG_CORE); + xfs_ifork_init_cow(src); + } else + xfs_iunlock(src, XFS_ILOCK_EXCL); + + if (src->i_ino == dest->i_ino) + goto commit_flags; + + if (!xfs_is_reflink_inode(dest)) { + trace_xfs_reflink_set_inode_flag(dest); + xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL); + dest->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; + xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); + xfs_ifork_init_cow(dest); + } else + xfs_iunlock(dest, XFS_ILOCK_EXCL); + +commit_flags: + error = xfs_trans_commit(tp); + if (error) + goto out_error; + return error; + +out_error: + trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_); + return error; +} + +/* + * Update destination inode size & cowextsize hint, if necessary. + */ +STATIC int +xfs_reflink_update_dest( + struct xfs_inode *dest, + xfs_off_t newlen, + xfs_extlen_t cowextsize) +{ + struct xfs_mount *mp = dest->i_mount; + struct xfs_trans *tp; + int error; + + if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) + return 0; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) + goto out_error; + + xfs_ilock(dest, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL); + + if (newlen > i_size_read(VFS_I(dest))) { + trace_xfs_reflink_update_inode_size(dest, newlen); + i_size_write(VFS_I(dest), newlen); + dest->i_d.di_size = newlen; + } + + if (cowextsize) { + dest->i_d.di_cowextsize = cowextsize; + dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + } + + xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); + + error = xfs_trans_commit(tp); + if (error) + goto out_error; + return error; + +out_error: + trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_); + return error; +} + +/* + * Do we have enough reserve in this AG to handle a reflink? The refcount + * btree already reserved all the space it needs, but the rmap btree can grow + * infinitely, so we won't allow more reflinks when the AG is down to the + * btree reserves. + */ +static int +xfs_reflink_ag_has_free_space( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int error = 0; + + if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + return 0; + + pag = xfs_perag_get(mp, agno); + if (xfs_ag_resv_critical(pag, XFS_AG_RESV_AGFL) || + xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA)) + error = -ENOSPC; + xfs_perag_put(pag); + return error; +} + +/* + * Unmap a range of blocks from a file, then map other blocks into the hole. + * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount). + * The extent irec is mapped into dest at irec->br_startoff. 
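+ *
+ * In effect: punch out everything in dest from destoff to the end of
+ * irec, then, if irec is a real allocated extent, bump its refcount
+ * and map it into dest at irec->br_startoff.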
+ */ +STATIC int +xfs_reflink_remap_extent( + struct xfs_inode *ip, + struct xfs_bmbt_irec *irec, + xfs_fileoff_t destoff, + xfs_off_t new_isize) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + xfs_fsblock_t firstfsb; + unsigned int resblks; + struct xfs_defer_ops dfops; + struct xfs_bmbt_irec uirec; + bool real_extent; + xfs_filblks_t rlen; + xfs_filblks_t unmap_len; + xfs_off_t newlen; + int error; + + unmap_len = irec->br_startoff + irec->br_blockcount - destoff; + trace_xfs_reflink_punch_range(ip, destoff, unmap_len); + + /* Only remap normal extents. */ + real_extent = (irec->br_startblock != HOLESTARTBLOCK && + irec->br_startblock != DELAYSTARTBLOCK && + !ISUNWRITTEN(irec)); + + /* No reflinking if we're low on space */ + if (real_extent) { + error = xfs_reflink_ag_has_free_space(mp, + XFS_FSB_TO_AGNO(mp, irec->br_startblock)); + if (error) + goto out; + } + + /* Start a rolling transaction to switch the mappings */ + resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + if (error) + goto out; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* If we're not just clearing space, then do we have enough quota? */ + if (real_extent) { + error = xfs_trans_reserve_quota_nblks(tp, ip, + irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS); + if (error) + goto out_cancel; + } + + trace_xfs_reflink_remap(ip, irec->br_startoff, + irec->br_blockcount, irec->br_startblock); + + /* Unmap the old blocks in the data fork. */ + rlen = unmap_len; + while (rlen) { + xfs_defer_init(&dfops, &firstfsb); + error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1, + &firstfsb, &dfops); + if (error) + goto out_defer; + + /* + * Trim the extent to whatever got unmapped. + * Remember, bunmapi works backwards. + */ + uirec.br_startblock = irec->br_startblock + rlen; + uirec.br_startoff = irec->br_startoff + rlen; + uirec.br_blockcount = unmap_len - rlen; + unmap_len = rlen; + + /* If this isn't a real mapping, we're done. */ + if (!real_extent || uirec.br_blockcount == 0) + goto next_extent; + + trace_xfs_reflink_remap(ip, uirec.br_startoff, + uirec.br_blockcount, uirec.br_startblock); + + /* Update the refcount tree */ + error = xfs_refcount_increase_extent(mp, &dfops, &uirec); + if (error) + goto out_defer; + + /* Map the new blocks into the data fork. */ + error = xfs_bmap_map_extent(mp, &dfops, ip, &uirec); + if (error) + goto out_defer; + + /* Update quota accounting. */ + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, + uirec.br_blockcount); + + /* Update dest isize if needed. */ + newlen = XFS_FSB_TO_B(mp, + uirec.br_startoff + uirec.br_blockcount); + newlen = min_t(xfs_off_t, newlen, new_isize); + if (newlen > i_size_read(VFS_I(ip))) { + trace_xfs_reflink_update_inode_size(ip, newlen); + i_size_write(VFS_I(ip), newlen); + ip->i_d.di_size = newlen; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } + +next_extent: + /* Process all the deferred stuff. */ + error = xfs_defer_finish(&tp, &dfops, ip); + if (error) + goto out_defer; + } + + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + goto out; + return 0; + +out_defer: + xfs_defer_cancel(&dfops); +out_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out: + trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_); + return error; +} + +/* + * Iteratively remap one file's extents (and holes) to another's. 
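+ *
+ * Holes and delalloc/unwritten extents in src still punch out the
+ * corresponding range in dest; they just map nothing back in.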
+ */ +STATIC int +xfs_reflink_remap_blocks( + struct xfs_inode *src, + xfs_fileoff_t srcoff, + struct xfs_inode *dest, + xfs_fileoff_t destoff, + xfs_filblks_t len, + xfs_off_t new_isize) +{ + struct xfs_bmbt_irec imap; + int nimaps; + int error = 0; + xfs_filblks_t range_len; + + /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ + while (len) { + trace_xfs_reflink_remap_blocks_loop(src, srcoff, len, + dest, destoff); + /* Read extent from the source file */ + nimaps = 1; + xfs_ilock(src, XFS_ILOCK_EXCL); + error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0); + xfs_iunlock(src, XFS_ILOCK_EXCL); + if (error) + goto err; + ASSERT(nimaps == 1); + + trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE, + &imap); + + /* Translate imap into the destination file. */ + range_len = imap.br_startoff + imap.br_blockcount - srcoff; + imap.br_startoff += destoff - srcoff; + + /* Clear dest from destoff to the end of imap and map it in. */ + error = xfs_reflink_remap_extent(dest, &imap, destoff, + new_isize); + if (error) + goto err; + + if (fatal_signal_pending(current)) { + error = -EINTR; + goto err; + } + + /* Advance drange/srange */ + srcoff += range_len; + destoff += range_len; + len -= range_len; + } + + return 0; + +err: + trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_); + return error; +} + +/* + * Read a page's worth of file data into the page cache. Return the page + * locked. + */ +static struct page * +xfs_get_page( + struct inode *inode, + xfs_off_t offset) +{ + struct address_space *mapping; + struct page *page; + pgoff_t n; + + n = offset >> PAGE_SHIFT; + mapping = inode->i_mapping; + page = read_mapping_page(mapping, n, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + lock_page(page); + return page; +} + +/* + * Compare extents of two files to see if they are the same. 
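+ *
+ * This backs the dedupe path: both ranges are byte-compared through
+ * the page cache, one page at a time, before any remapping happens.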
+ */ +static int +xfs_compare_extents( + struct inode *src, + xfs_off_t srcoff, + struct inode *dest, + xfs_off_t destoff, + xfs_off_t len, + bool *is_same) +{ + xfs_off_t src_poff; + xfs_off_t dest_poff; + void *src_addr; + void *dest_addr; + struct page *src_page; + struct page *dest_page; + xfs_off_t cmp_len; + bool same; + int error; + + error = -EINVAL; + same = true; + while (len) { + src_poff = srcoff & (PAGE_SIZE - 1); + dest_poff = destoff & (PAGE_SIZE - 1); + cmp_len = min(PAGE_SIZE - src_poff, + PAGE_SIZE - dest_poff); + cmp_len = min(cmp_len, len); + ASSERT(cmp_len > 0); + + trace_xfs_reflink_compare_extents(XFS_I(src), srcoff, cmp_len, + XFS_I(dest), destoff); + + src_page = xfs_get_page(src, srcoff); + if (IS_ERR(src_page)) { + error = PTR_ERR(src_page); + goto out_error; + } + dest_page = xfs_get_page(dest, destoff); + if (IS_ERR(dest_page)) { + error = PTR_ERR(dest_page); + unlock_page(src_page); + put_page(src_page); + goto out_error; + } + src_addr = kmap_atomic(src_page); + dest_addr = kmap_atomic(dest_page); + + flush_dcache_page(src_page); + flush_dcache_page(dest_page); + + if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) + same = false; + + kunmap_atomic(dest_addr); + kunmap_atomic(src_addr); + unlock_page(dest_page); + unlock_page(src_page); + put_page(dest_page); + put_page(src_page); + + if (!same) + break; + + srcoff += cmp_len; + destoff += cmp_len; + len -= cmp_len; + } + + *is_same = same; + return 0; + +out_error: + trace_xfs_reflink_compare_extents_error(XFS_I(dest), error, _RET_IP_); + return error; +} + +/* + * Link a range of blocks from one file to another. + */ +int +xfs_reflink_remap_range( + struct xfs_inode *src, + xfs_off_t srcoff, + struct xfs_inode *dest, + xfs_off_t destoff, + xfs_off_t len, + unsigned int flags) +{ + struct xfs_mount *mp = src->i_mount; + xfs_fileoff_t sfsbno, dfsbno; + xfs_filblks_t fsblen; + int error; + xfs_extlen_t cowextsize; + bool is_same; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return -EOPNOTSUPP; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* Don't reflink realtime inodes */ + if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) + return -EINVAL; + + if (flags & ~XFS_REFLINK_ALL) + return -EINVAL; + + trace_xfs_reflink_remap_range(src, srcoff, len, dest, destoff); + + /* Lock both files against IO */ + if (src->i_ino == dest->i_ino) { + xfs_ilock(src, XFS_IOLOCK_EXCL); + xfs_ilock(src, XFS_MMAPLOCK_EXCL); + } else { + xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL); + } + + /* + * Check that the extents are the same. + */ + if (flags & XFS_REFLINK_DEDUPE) { + is_same = false; + error = xfs_compare_extents(VFS_I(src), srcoff, VFS_I(dest), + destoff, len, &is_same); + if (error) + goto out_error; + if (!is_same) { + error = -EBADE; + goto out_error; + } + } + + error = xfs_reflink_set_inode_flag(src, dest); + if (error) + goto out_error; + + /* + * Invalidate the page cache so that we can clear any CoW mappings + * in the destination file. 
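+ * Stale pages could otherwise carry CoW state that no longer
+ * matches the remapped data fork.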
+ */ + truncate_inode_pages_range(&VFS_I(dest)->i_data, destoff, + PAGE_ALIGN(destoff + len) - 1); + + dfsbno = XFS_B_TO_FSBT(mp, destoff); + sfsbno = XFS_B_TO_FSBT(mp, srcoff); + fsblen = XFS_B_TO_FSB(mp, len); + error = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen, + destoff + len); + if (error) + goto out_error; + + /* + * Carry the cowextsize hint from src to dest if we're sharing the + * entire source file to the entire destination file, the source file + * has a cowextsize hint, and the destination file does not. + */ + cowextsize = 0; + if (srcoff == 0 && len == i_size_read(VFS_I(src)) && + (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) && + destoff == 0 && len >= i_size_read(VFS_I(dest)) && + !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) + cowextsize = src->i_d.di_cowextsize; + + error = xfs_reflink_update_dest(dest, destoff + len, cowextsize); + if (error) + goto out_error; + +out_error: + xfs_iunlock(src, XFS_MMAPLOCK_EXCL); + xfs_iunlock(src, XFS_IOLOCK_EXCL); + if (src->i_ino != dest->i_ino) { + xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); + xfs_iunlock(dest, XFS_IOLOCK_EXCL); + } + if (error) + trace_xfs_reflink_remap_range_error(dest, error, _RET_IP_); + return error; +} + +/* + * The user wants to preemptively CoW all shared blocks in this file, + * which enables us to turn off the reflink flag. Iterate all + * extents which are not prealloc/delalloc to see which ranges are + * mentioned in the refcount tree, then read those blocks into the + * pagecache, dirty them, fsync them back out, and then we can update + * the inode flag. What happens if we run out of memory? :) + */ +STATIC int +xfs_reflink_dirty_extents( + struct xfs_inode *ip, + xfs_fileoff_t fbno, + xfs_filblks_t end, + xfs_off_t isize) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_extlen_t aglen; + xfs_agblock_t rbno; + xfs_extlen_t rlen; + xfs_off_t fpos; + xfs_off_t flen; + struct xfs_bmbt_irec map[2]; + int nmaps; + int error = 0; + + while (end - fbno > 0) { + nmaps = 1; + /* + * Look for extents in the file. Skip holes, delalloc, or + * unwritten extents; they can't be reflinked. + */ + error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0); + if (error) + goto out; + if (nmaps == 0) + break; + if (map[0].br_startblock == HOLESTARTBLOCK || + map[0].br_startblock == DELAYSTARTBLOCK || + ISUNWRITTEN(&map[0])) + goto next; + + map[1] = map[0]; + while (map[1].br_blockcount) { + agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock); + agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock); + aglen = map[1].br_blockcount; + + error = xfs_reflink_find_shared(mp, agno, agbno, aglen, + &rbno, &rlen, true); + if (error) + goto out; + if (rbno == NULLAGBLOCK) + break; + + /* Dirty the pages */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + fpos = XFS_FSB_TO_B(mp, map[1].br_startoff + + (rbno - agbno)); + flen = XFS_FSB_TO_B(mp, rlen); + if (fpos + flen > isize) + flen = isize - fpos; + error = iomap_file_dirty(VFS_I(ip), fpos, flen, + &xfs_iomap_ops); + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (error) + goto out; + + map[1].br_blockcount -= (rbno - agbno + rlen); + map[1].br_startoff += (rbno - agbno + rlen); + map[1].br_startblock += (rbno - agbno + rlen); + } + +next: + fbno = map[0].br_startoff + map[0].br_blockcount; + } +out: + return error; +} + +/* Clear the inode reflink flag if there are no shared extents. 
*/ +int +xfs_reflink_clear_inode_flag( + struct xfs_inode *ip, + struct xfs_trans **tpp) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t fbno; + xfs_filblks_t end; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_extlen_t aglen; + xfs_agblock_t rbno; + xfs_extlen_t rlen; + struct xfs_bmbt_irec map; + int nmaps; + int error = 0; + + ASSERT(xfs_is_reflink_inode(ip)); + + fbno = 0; + end = XFS_B_TO_FSB(mp, i_size_read(VFS_I(ip))); + while (end - fbno > 0) { + nmaps = 1; + /* + * Look for extents in the file. Skip holes, delalloc, or + * unwritten extents; they can't be reflinked. + */ + error = xfs_bmapi_read(ip, fbno, end - fbno, &map, &nmaps, 0); + if (error) + return error; + if (nmaps == 0) + break; + if (map.br_startblock == HOLESTARTBLOCK || + map.br_startblock == DELAYSTARTBLOCK || + ISUNWRITTEN(&map)) + goto next; + + agno = XFS_FSB_TO_AGNO(mp, map.br_startblock); + agbno = XFS_FSB_TO_AGBNO(mp, map.br_startblock); + aglen = map.br_blockcount; + + error = xfs_reflink_find_shared(mp, agno, agbno, aglen, + &rbno, &rlen, false); + if (error) + return error; + /* Is there still a shared block here? */ + if (rbno != NULLAGBLOCK) + return 0; +next: + fbno = map.br_startoff + map.br_blockcount; + } + + /* + * We didn't find any shared blocks so turn off the reflink flag. + * First, get rid of any leftover CoW mappings. + */ + error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF); + if (error) + return error; + + /* Clear the inode flag. */ + trace_xfs_reflink_unset_inode_flag(ip); + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + xfs_inode_clear_cowblocks_tag(ip); + xfs_trans_ijoin(*tpp, ip, 0); + xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); + + return error; +} + +/* + * Clear the inode reflink flag if there are no shared extents and the size + * hasn't changed. + */ +STATIC int +xfs_reflink_try_clear_inode_flag( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error = 0; + + /* Start a rolling transaction to remove the mappings */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + error = xfs_reflink_clear_inode_flag(ip, &tp); + if (error) + goto cancel; + + error = xfs_trans_commit(tp); + if (error) + goto out; + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +cancel: + xfs_trans_cancel(tp); +out: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * Pre-COW all shared blocks within a given byte range of a file and turn off + * the reflink flag if we unshare all of the file's blocks. + */ +int +xfs_reflink_unshare( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t fbno; + xfs_filblks_t end; + xfs_off_t isize; + int error; + + if (!xfs_is_reflink_inode(ip)) + return 0; + + trace_xfs_reflink_unshare(ip, offset, len); + + inode_dio_wait(VFS_I(ip)); + + /* Try to CoW the selected ranges */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + fbno = XFS_B_TO_FSBT(mp, offset); + isize = i_size_read(VFS_I(ip)); + end = XFS_B_TO_FSB(mp, offset + len); + error = xfs_reflink_dirty_extents(ip, fbno, end, isize); + if (error) + goto out_unlock; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + /* Wait for the IO to finish */ + error = filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (error) + goto out; + + /* Turn off the reflink flag if possible. 
*/ + error = xfs_reflink_try_clear_inode_flag(ip); + if (error) + goto out; + + return 0; + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out: + trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); + return error; +} + +/* + * Does this inode have any real CoW reservations? + */ +bool +xfs_reflink_has_real_cow_blocks( + struct xfs_inode *ip) +{ + struct xfs_bmbt_irec irec; + struct xfs_ifork *ifp; + struct xfs_bmbt_rec_host *gotp; + xfs_extnum_t idx; + + if (!xfs_is_reflink_inode(ip)) + return false; + + /* Go find the old extent in the CoW fork. */ + ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + gotp = xfs_iext_bno_to_ext(ifp, 0, &idx); + while (gotp) { + xfs_bmbt_get_all(gotp, &irec); + + if (!isnullstartblock(irec.br_startblock)) + return true; + + /* Roll on... */ + idx++; + if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + break; + gotp = xfs_iext_get_ext(ifp, idx); + } + + return false; +} diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h new file mode 100644 index 000000000000..5dc3c8ac12aa --- /dev/null +++ b/fs/xfs/xfs_reflink.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ +#ifndef __XFS_REFLINK_H +#define __XFS_REFLINK_H 1 + +extern int xfs_reflink_find_shared(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, + xfs_extlen_t *flen, bool find_maximal); +extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, + struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed); + +extern int xfs_reflink_reserve_cow_range(struct xfs_inode *ip, + xfs_off_t offset, xfs_off_t count); +extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, + xfs_off_t offset, xfs_off_t count); +extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset, + struct xfs_bmbt_irec *imap, bool *need_alloc); +extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap); + +extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip, + struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, + xfs_fileoff_t end_fsb); +extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t count); +extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t count); +extern int xfs_reflink_recover_cow(struct xfs_mount *mp); +#define XFS_REFLINK_DEDUPE 1 /* only reflink if contents match */ +#define XFS_REFLINK_ALL (XFS_REFLINK_DEDUPE) +extern int xfs_reflink_remap_range(struct xfs_inode *src, xfs_off_t srcoff, + struct xfs_inode *dest, xfs_off_t destoff, xfs_off_t len, + unsigned int flags); +extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip, + struct xfs_trans **tpp); +extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t len); + +extern bool xfs_reflink_has_real_cow_blocks(struct xfs_inode *ip); + +#endif /* __XFS_REFLINK_H */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 0432a459871c..73c827831551 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -441,8 +441,11 @@ xfs_rui_recover( XFS_FSB_TO_DADDR(mp, rmap->me_startblock)); switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { case XFS_RMAP_EXTENT_MAP: + case XFS_RMAP_EXTENT_MAP_SHARED: case XFS_RMAP_EXTENT_UNMAP: + case XFS_RMAP_EXTENT_UNMAP_SHARED: case XFS_RMAP_EXTENT_CONVERT: + case XFS_RMAP_EXTENT_CONVERT_SHARED: case XFS_RMAP_EXTENT_ALLOC: case XFS_RMAP_EXTENT_FREE: op_ok = true; @@ -481,12 +484,21 @@ xfs_rui_recover( case XFS_RMAP_EXTENT_MAP: type = XFS_RMAP_MAP; break; + case XFS_RMAP_EXTENT_MAP_SHARED: + type = XFS_RMAP_MAP_SHARED; + break; case XFS_RMAP_EXTENT_UNMAP: type = XFS_RMAP_UNMAP; break; + case XFS_RMAP_EXTENT_UNMAP_SHARED: + type = XFS_RMAP_UNMAP_SHARED; + break; case XFS_RMAP_EXTENT_CONVERT: type = XFS_RMAP_CONVERT; break; + case XFS_RMAP_EXTENT_CONVERT_SHARED: + type = XFS_RMAP_CONVERT_SHARED; + break; case XFS_RMAP_EXTENT_ALLOC: type = XFS_RMAP_ALLOC; break; diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 6e812fe0fd43..12d48cd8f8a4 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -62,6 +62,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) { "ibt2", XFSSTAT_END_IBT_V2 }, { "fibt2", XFSSTAT_END_FIBT_V2 }, { "rmapbt", XFSSTAT_END_RMAP_V2 }, + { "refcntbt", XFSSTAT_END_REFCOUNT }, /* we print both series of quota information together */ { "qm", XFSSTAT_END_QM }, }; diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 657865f51e78..79ad2e69fc33 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -213,7 +213,23 @@ struct xfsstats { __uint32_t xs_rmap_2_alloc; __uint32_t xs_rmap_2_free; __uint32_t 
xs_rmap_2_moves;
-#define XFSSTAT_END_XQMSTAT		(XFSSTAT_END_RMAP_V2+6)
+#define XFSSTAT_END_REFCOUNT		(XFSSTAT_END_RMAP_V2 + 15)
+	__uint32_t		xs_refcbt_2_lookup;
+	__uint32_t		xs_refcbt_2_compare;
+	__uint32_t		xs_refcbt_2_insrec;
+	__uint32_t		xs_refcbt_2_delrec;
+	__uint32_t		xs_refcbt_2_newroot;
+	__uint32_t		xs_refcbt_2_killroot;
+	__uint32_t		xs_refcbt_2_increment;
+	__uint32_t		xs_refcbt_2_decrement;
+	__uint32_t		xs_refcbt_2_lshift;
+	__uint32_t		xs_refcbt_2_rshift;
+	__uint32_t		xs_refcbt_2_split;
+	__uint32_t		xs_refcbt_2_join;
+	__uint32_t		xs_refcbt_2_alloc;
+	__uint32_t		xs_refcbt_2_free;
+	__uint32_t		xs_refcbt_2_moves;
+#define XFSSTAT_END_XQMSTAT		(XFSSTAT_END_REFCOUNT + 6)
 	__uint32_t		xs_qm_dqreclaims;
 	__uint32_t		xs_qm_dqreclaim_misses;
 	__uint32_t		xs_qm_dquot_dups;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 2d092f9577ca..ade4691e3f74 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -47,6 +47,9 @@
 #include "xfs_sysfs.h"
 #include "xfs_ondisk.h"
 #include "xfs_rmap_item.h"
+#include "xfs_refcount_item.h"
+#include "xfs_bmap_item.h"
+#include "xfs_reflink.h"
 
 #include <linux/namei.h>
 #include <linux/init.h>
@@ -936,6 +939,7 @@ xfs_fs_destroy_inode(
 	struct inode		*inode)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
+	int			error;
 
 	trace_xfs_destroy_inode(ip);
 
@@ -943,6 +947,14 @@
 	XFS_STATS_INC(ip->i_mount, vn_rele);
 	XFS_STATS_INC(ip->i_mount, vn_remove);
 
+	if (xfs_is_reflink_inode(ip)) {
+		error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);
+		if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount))
+			xfs_warn(ip->i_mount,
+"Error %d while evicting CoW blocks for inode %llu.",
+				error, ip->i_ino);
+	}
+
 	xfs_inactive(ip);
 
 	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
@@ -1006,6 +1018,16 @@ xfs_fs_drop_inode(
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 
+	/*
+	 * If this unlinked inode is in the middle of recovery, don't
+	 * drop the inode just yet; log recovery will take care of
+	 * that.  See the comment for this inode flag.
+	 */
+	if (ip->i_flags & XFS_IRECOVERY) {
+		ASSERT(ip->i_mount->m_log->l_flags & XLOG_RECOVERY_NEEDED);
+		return 0;
+	}
+
 	return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE);
 }
 
@@ -1296,10 +1318,31 @@ xfs_fs_remount(
 		xfs_restore_resvblks(mp);
 		xfs_log_work_queue(mp);
 		xfs_queue_eofblocks(mp);
+
+		/* Recover any CoW blocks that never got remapped. */
+		error = xfs_reflink_recover_cow(mp);
+		if (error) {
+			xfs_err(mp,
+	"Error %d recovering leftover CoW allocations.", error);
+			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+			return error;
+		}
+
+		/* Create the per-AG metadata reservation pool. */
+		error = xfs_fs_reserve_ag_blocks(mp);
+		if (error && error != -ENOSPC)
+			return error;
 	}
 
 	/* rw -> ro */
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
+		/* Free the per-AG metadata reservation pool. */
+		error = xfs_fs_unreserve_ag_blocks(mp);
+		if (error) {
+			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+			return error;
+		}
+
 		/*
 		 * Before we sync the metadata, we need to free up the reserve
 		 * block pool so that the used block count in the superblock on
@@ -1490,6 +1533,7 @@ xfs_fs_fill_super(
 	atomic_set(&mp->m_active_trans, 0);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 	INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
+	INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
 	mp->m_kobj.kobject.kset = xfs_kset;
 
 	mp->m_super = sb;
@@ -1572,6 +1616,9 @@
 			"DAX unsupported by block device.
Turning off DAX."); mp->m_flags &= ~XFS_MOUNT_DAX; } + if (xfs_sb_version_hasreflink(&mp->m_sb)) + xfs_alert(mp, + "DAX and reflink have not been tested together!"); } if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { @@ -1585,6 +1632,10 @@ xfs_fs_fill_super( "EXPERIMENTAL reverse mapping btree feature enabled. Use at your own risk!"); } + if (xfs_sb_version_hasreflink(&mp->m_sb)) + xfs_alert(mp, + "EXPERIMENTAL reflink feature enabled. Use at your own risk!"); + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; @@ -1788,8 +1839,38 @@ xfs_init_zones(void) if (!xfs_rui_zone) goto out_destroy_rud_zone; + xfs_cud_zone = kmem_zone_init(sizeof(struct xfs_cud_log_item), + "xfs_cud_item"); + if (!xfs_cud_zone) + goto out_destroy_rui_zone; + + xfs_cui_zone = kmem_zone_init( + xfs_cui_log_item_sizeof(XFS_CUI_MAX_FAST_EXTENTS), + "xfs_cui_item"); + if (!xfs_cui_zone) + goto out_destroy_cud_zone; + + xfs_bud_zone = kmem_zone_init(sizeof(struct xfs_bud_log_item), + "xfs_bud_item"); + if (!xfs_bud_zone) + goto out_destroy_cui_zone; + + xfs_bui_zone = kmem_zone_init( + xfs_bui_log_item_sizeof(XFS_BUI_MAX_FAST_EXTENTS), + "xfs_bui_item"); + if (!xfs_bui_zone) + goto out_destroy_bud_zone; + return 0; + out_destroy_bud_zone: + kmem_zone_destroy(xfs_bud_zone); + out_destroy_cui_zone: + kmem_zone_destroy(xfs_cui_zone); + out_destroy_cud_zone: + kmem_zone_destroy(xfs_cud_zone); + out_destroy_rui_zone: + kmem_zone_destroy(xfs_rui_zone); out_destroy_rud_zone: kmem_zone_destroy(xfs_rud_zone); out_destroy_icreate_zone: @@ -1832,6 +1913,10 @@ xfs_destroy_zones(void) * destroy caches. */ rcu_barrier(); + kmem_zone_destroy(xfs_bui_zone); + kmem_zone_destroy(xfs_bud_zone); + kmem_zone_destroy(xfs_cui_zone); + kmem_zone_destroy(xfs_cud_zone); kmem_zone_destroy(xfs_rui_zone); kmem_zone_destroy(xfs_rud_zone); kmem_zone_destroy(xfs_icreate_zone); @@ -1885,6 +1970,8 @@ init_xfs_fs(void) xfs_extent_free_init_defer_op(); xfs_rmap_update_init_defer_op(); + xfs_refcount_update_init_defer_op(); + xfs_bmap_update_init_defer_op(); xfs_dir_startup(); diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index aed74d3f8da9..afe1f66aaa69 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -184,6 +184,15 @@ static struct ctl_table xfs_table[] = { .extra1 = &xfs_params.eofb_timer.min, .extra2 = &xfs_params.eofb_timer.max, }, + { + .procname = "speculative_cow_prealloc_lifetime", + .data = &xfs_params.cowb_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.cowb_timer.min, + .extra2 = &xfs_params.cowb_timer.max, + }, /* please keep this the last entry */ #ifdef CONFIG_PROC_FS { diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index ffef45375754..984a3499cfe3 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -48,6 +48,7 @@ typedef struct xfs_param { xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. 
*/ xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */ + xfs_sysctl_val_t cowb_timer; /* Interval between cowb scan wakeups */ } xfs_param_t; /* diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 16093c7dacde..ad188d3a83f3 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -39,6 +39,7 @@ struct xfs_buf_log_format; struct xfs_inode_log_format; struct xfs_bmbt_irec; struct xfs_btree_cur; +struct xfs_refcount_irec; DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), @@ -135,6 +136,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_cowblocks); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_cowblocks); DECLARE_EVENT_CLASS(xfs_ag_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), @@ -268,10 +271,10 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, __field(unsigned long, caller_ip) ), TP_fast_assign( - struct xfs_ifork *ifp = (state & BMAP_ATTRFORK) ? - ip->i_afp : &ip->i_df; + struct xfs_ifork *ifp; struct xfs_bmbt_irec r; + ifp = xfs_iext_state_to_fork(ip, state); xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r); __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; @@ -686,6 +689,9 @@ DEFINE_INODE_EVENT(xfs_dquot_dqdetach); DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); +DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); DEFINE_INODE_EVENT(xfs_filemap_fault); DEFINE_INODE_EVENT(xfs_filemap_pmd_fault); @@ -2581,10 +2587,20 @@ DEFINE_RMAPBT_EVENT(xfs_rmap_delete); DEFINE_AG_ERROR_EVENT(xfs_rmap_insert_error); DEFINE_AG_ERROR_EVENT(xfs_rmap_delete_error); DEFINE_AG_ERROR_EVENT(xfs_rmap_update_error); + +DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_candidate); +DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_query); +DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_candidate); +DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range); DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result); DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result); DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result); +/* deferred bmbt updates */ +#define DEFINE_BMAP_DEFERRED_EVENT DEFINE_RMAP_DEFERRED_EVENT +DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_defer); +DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_deferred); + /* per-AG reservation */ DECLARE_EVENT_CLASS(xfs_ag_resv_class, TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type resv, @@ -2639,6 +2655,728 @@ DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed); DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error); DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error); +/* refcount tracepoint classes */ + +/* reuse the discard trace class for agbno/aglen-based traces */ +#define DEFINE_AG_EXTENT_EVENT(name) DEFINE_DISCARD_EVENT(name) + +/* ag btree lookup tracepoint class */ +#define XFS_AG_BTREE_CMP_FORMAT_STR \ + { XFS_LOOKUP_EQ, "eq" }, \ + { XFS_LOOKUP_LE, "le" }, \ + { XFS_LOOKUP_GE, "ge" } +DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_lookup_t dir), + TP_ARGS(mp, agno, agbno, dir), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_lookup_t, dir) + ), + TP_fast_assign( + __entry->dev = 
mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->dir = dir; + ), + TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __print_symbolic(__entry->dir, XFS_AG_BTREE_CMP_FORMAT_STR), + __entry->dir) +) + +#define DEFINE_AG_BTREE_LOOKUP_EVENT(name) \ +DEFINE_EVENT(xfs_ag_btree_lookup_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_lookup_t dir), \ + TP_ARGS(mp, agno, agbno, dir)) + +/* single-rcext tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_extent_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *irec), + TP_ARGS(mp, agno, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + __field(xfs_nlink_t, refcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startblock = irec->rc_startblock; + __entry->blockcount = irec->rc_blockcount; + __entry->refcount = irec->rc_refcount; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startblock, + __entry->blockcount, + __entry->refcount) +) + +#define DEFINE_REFCOUNT_EXTENT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + struct xfs_refcount_irec *irec), \ + TP_ARGS(mp, agno, irec)) + +/* single-rcext and an agbno tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *irec, xfs_agblock_t agbno), + TP_ARGS(mp, agno, irec, agbno), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + __field(xfs_nlink_t, refcount) + __field(xfs_agblock_t, agbno) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startblock = irec->rc_startblock; + __entry->blockcount = irec->rc_blockcount; + __entry->refcount = irec->rc_refcount; + __entry->agbno = agbno; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startblock, + __entry->blockcount, + __entry->refcount, + __entry->agbno) +) + +#define DEFINE_REFCOUNT_EXTENT_AT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_extent_at_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + struct xfs_refcount_irec *irec, xfs_agblock_t agbno), \ + TP_ARGS(mp, agno, irec, agbno)) + +/* double-rcext tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2), + TP_ARGS(mp, agno, i1, i2), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, i1_startblock) + __field(xfs_extlen_t, i1_blockcount) + __field(xfs_nlink_t, i1_refcount) + __field(xfs_agblock_t, i2_startblock) + __field(xfs_extlen_t, i2_blockcount) + __field(xfs_nlink_t, i2_refcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->i1_startblock = i1->rc_startblock; + __entry->i1_blockcount = i1->rc_blockcount; + __entry->i1_refcount = i1->rc_refcount; + __entry->i2_startblock = i2->rc_startblock; + 
__entry->i2_blockcount = i2->rc_blockcount; + __entry->i2_refcount = i2->rc_refcount; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " + "agbno %u len %u refcount %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->i1_startblock, + __entry->i1_blockcount, + __entry->i1_refcount, + __entry->i2_startblock, + __entry->i2_blockcount, + __entry->i2_refcount) +) + +#define DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_double_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2), \ + TP_ARGS(mp, agno, i1, i2)) + +/* double-rcext and an agbno tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, + xfs_agblock_t agbno), + TP_ARGS(mp, agno, i1, i2, agbno), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, i1_startblock) + __field(xfs_extlen_t, i1_blockcount) + __field(xfs_nlink_t, i1_refcount) + __field(xfs_agblock_t, i2_startblock) + __field(xfs_extlen_t, i2_blockcount) + __field(xfs_nlink_t, i2_refcount) + __field(xfs_agblock_t, agbno) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->i1_startblock = i1->rc_startblock; + __entry->i1_blockcount = i1->rc_blockcount; + __entry->i1_refcount = i1->rc_refcount; + __entry->i2_startblock = i2->rc_startblock; + __entry->i2_blockcount = i2->rc_blockcount; + __entry->i2_refcount = i2->rc_refcount; + __entry->agbno = agbno; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " + "agbno %u len %u refcount %u @ agbno %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->i1_startblock, + __entry->i1_blockcount, + __entry->i1_refcount, + __entry->i2_startblock, + __entry->i2_blockcount, + __entry->i2_refcount, + __entry->agbno) +) + +#define DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_double_extent_at_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \ + xfs_agblock_t agbno), \ + TP_ARGS(mp, agno, i1, i2, agbno)) + +/* triple-rcext tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, + struct xfs_refcount_irec *i3), + TP_ARGS(mp, agno, i1, i2, i3), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, i1_startblock) + __field(xfs_extlen_t, i1_blockcount) + __field(xfs_nlink_t, i1_refcount) + __field(xfs_agblock_t, i2_startblock) + __field(xfs_extlen_t, i2_blockcount) + __field(xfs_nlink_t, i2_refcount) + __field(xfs_agblock_t, i3_startblock) + __field(xfs_extlen_t, i3_blockcount) + __field(xfs_nlink_t, i3_refcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->i1_startblock = i1->rc_startblock; + __entry->i1_blockcount = i1->rc_blockcount; + __entry->i1_refcount = i1->rc_refcount; + __entry->i2_startblock = i2->rc_startblock; + __entry->i2_blockcount = i2->rc_blockcount; + __entry->i2_refcount = i2->rc_refcount; + __entry->i3_startblock = i3->rc_startblock; + __entry->i3_blockcount = i3->rc_blockcount; + __entry->i3_refcount = i3->rc_refcount; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u 
refcount %u -- " + "agbno %u len %u refcount %u -- " + "agbno %u len %u refcount %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->i1_startblock, + __entry->i1_blockcount, + __entry->i1_refcount, + __entry->i2_startblock, + __entry->i2_blockcount, + __entry->i2_refcount, + __entry->i3_startblock, + __entry->i3_blockcount, + __entry->i3_refcount) +); + +#define DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_triple_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \ + struct xfs_refcount_irec *i3), \ + TP_ARGS(mp, agno, i1, i2, i3)) + +/* refcount btree tracepoints */ +DEFINE_BUSY_EVENT(xfs_refcountbt_alloc_block); +DEFINE_BUSY_EVENT(xfs_refcountbt_free_block); +DEFINE_AG_BTREE_LOOKUP_EVENT(xfs_refcount_lookup); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_update); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_insert); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_delete); +DEFINE_AG_ERROR_EVENT(xfs_refcount_insert_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_delete_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_update_error); + +/* refcount adjustment tracepoints */ +DEFINE_AG_EXTENT_EVENT(xfs_refcount_increase); +DEFINE_AG_EXTENT_EVENT(xfs_refcount_decrease); +DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_increase); +DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_decrease); +DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(xfs_refcount_merge_center_extents); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_modify_extent); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_recover_extent); +DEFINE_REFCOUNT_EXTENT_AT_EVENT(xfs_refcount_split_extent); +DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_left_extent); +DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_right_extent); +DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_left_extent); +DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_right_extent); +DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_cow_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_center_extents_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_modify_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_split_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_left_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_right_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_find_left_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_find_right_extent_error); + +/* reflink helpers */ +DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared); +DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared_result); +DEFINE_AG_ERROR_EVENT(xfs_refcount_find_shared_error); +#define DEFINE_REFCOUNT_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT +DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_defer); +DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred); + +TRACE_EVENT(xfs_refcount_finish_one_leftover, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + int type, xfs_agblock_t agbno, xfs_extlen_t len, + xfs_agblock_t new_agbno, xfs_extlen_t new_len), + TP_ARGS(mp, agno, type, agbno, len, new_agbno, new_len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, type) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(xfs_agblock_t, new_agbno) + __field(xfs_extlen_t, new_len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->type = type; + __entry->agbno = agbno; + __entry->len = len; + 
__entry->new_agbno = new_agbno; + __entry->new_len = new_len; + ), + TP_printk("dev %d:%d type %d agno %u agbno %u len %u new_agbno %u new_len %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->agno, + __entry->agbno, + __entry->len, + __entry->new_agbno, + __entry->new_len) +); + +/* simple inode-based error/%ip tracepoint class */ +DECLARE_EVENT_CLASS(xfs_inode_error_class, + TP_PROTO(struct xfs_inode *ip, int error, unsigned long caller_ip), + TP_ARGS(ip, error, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, error) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->error = error; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino %llx error %d caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->error, + (char *)__entry->caller_ip) +); + +#define DEFINE_INODE_ERROR_EVENT(name) \ +DEFINE_EVENT(xfs_inode_error_class, name, \ + TP_PROTO(struct xfs_inode *ip, int error, \ + unsigned long caller_ip), \ + TP_ARGS(ip, error, caller_ip)) + +/* reflink allocator */ +TRACE_EVENT(xfs_bmap_remap_alloc, + TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t fsbno, + xfs_extlen_t len), + TP_ARGS(ip, fsbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, fsbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->fsbno = fsbno; + __entry->len = len; + ), + TP_printk("dev %d:%d ino 0x%llx fsbno 0x%llx len %x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->fsbno, + __entry->len) +); +DEFINE_INODE_ERROR_EVENT(xfs_bmap_remap_alloc_error); + +/* reflink tracepoint classes */ + +/* two-file io tracepoint class */ +DECLARE_EVENT_CLASS(xfs_double_io_class, + TP_PROTO(struct xfs_inode *src, xfs_off_t soffset, xfs_off_t len, + struct xfs_inode *dest, xfs_off_t doffset), + TP_ARGS(src, soffset, len, dest, doffset), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, src_ino) + __field(loff_t, src_isize) + __field(loff_t, src_disize) + __field(loff_t, src_offset) + __field(size_t, len) + __field(xfs_ino_t, dest_ino) + __field(loff_t, dest_isize) + __field(loff_t, dest_disize) + __field(loff_t, dest_offset) + ), + TP_fast_assign( + __entry->dev = VFS_I(src)->i_sb->s_dev; + __entry->src_ino = src->i_ino; + __entry->src_isize = VFS_I(src)->i_size; + __entry->src_disize = src->i_d.di_size; + __entry->src_offset = soffset; + __entry->len = len; + __entry->dest_ino = dest->i_ino; + __entry->dest_isize = VFS_I(dest)->i_size; + __entry->dest_disize = dest->i_d.di_size; + __entry->dest_offset = doffset; + ), + TP_printk("dev %d:%d count %zd " + "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx -> " + "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->len, + __entry->src_ino, + __entry->src_isize, + __entry->src_disize, + __entry->src_offset, + __entry->dest_ino, + __entry->dest_isize, + __entry->dest_disize, + __entry->dest_offset) +) + +#define DEFINE_DOUBLE_IO_EVENT(name) \ +DEFINE_EVENT(xfs_double_io_class, name, \ + TP_PROTO(struct xfs_inode *src, xfs_off_t soffset, xfs_off_t len, \ + struct xfs_inode *dest, xfs_off_t doffset), \ + TP_ARGS(src, soffset, len, dest, doffset)) + +/* two-file vfs io tracepoint class */ +DECLARE_EVENT_CLASS(xfs_double_vfs_io_class, + TP_PROTO(struct inode *src, u64 
soffset, u64 len, + struct inode *dest, u64 doffset), + TP_ARGS(src, soffset, len, dest, doffset), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, src_ino) + __field(loff_t, src_isize) + __field(loff_t, src_offset) + __field(size_t, len) + __field(unsigned long, dest_ino) + __field(loff_t, dest_isize) + __field(loff_t, dest_offset) + ), + TP_fast_assign( + __entry->dev = src->i_sb->s_dev; + __entry->src_ino = src->i_ino; + __entry->src_isize = i_size_read(src); + __entry->src_offset = soffset; + __entry->len = len; + __entry->dest_ino = dest->i_ino; + __entry->dest_isize = i_size_read(dest); + __entry->dest_offset = doffset; + ), + TP_printk("dev %d:%d count %zd " + "ino 0x%lx isize 0x%llx offset 0x%llx -> " + "ino 0x%lx isize 0x%llx offset 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->len, + __entry->src_ino, + __entry->src_isize, + __entry->src_offset, + __entry->dest_ino, + __entry->dest_isize, + __entry->dest_offset) +) + +#define DEFINE_DOUBLE_VFS_IO_EVENT(name) \ +DEFINE_EVENT(xfs_double_vfs_io_class, name, \ + TP_PROTO(struct inode *src, u64 soffset, u64 len, \ + struct inode *dest, u64 doffset), \ + TP_ARGS(src, soffset, len, dest, doffset)) + +/* CoW write tracepoint */ +DECLARE_EVENT_CLASS(xfs_copy_on_write_class, + TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, + xfs_extlen_t len, xfs_fsblock_t new_pblk), + TP_ARGS(ip, lblk, pblk, len, new_pblk), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fileoff_t, lblk) + __field(xfs_fsblock_t, pblk) + __field(xfs_extlen_t, len) + __field(xfs_fsblock_t, new_pblk) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->lblk = lblk; + __entry->pblk = pblk; + __entry->len = len; + __entry->new_pblk = new_pblk; + ), + TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx pblk 0x%llx " + "len 0x%x new_pblk %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->lblk, + __entry->pblk, + __entry->len, + __entry->new_pblk) +) + +#define DEFINE_COW_EVENT(name) \ +DEFINE_EVENT(xfs_copy_on_write_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, \ + xfs_extlen_t len, xfs_fsblock_t new_pblk), \ + TP_ARGS(ip, lblk, pblk, len, new_pblk)) + +/* inode/irec events */ +DECLARE_EVENT_CLASS(xfs_inode_irec_class, + TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), + TP_ARGS(ip, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fileoff_t, lblk) + __field(xfs_extlen_t, len) + __field(xfs_fsblock_t, pblk) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->lblk = irec->br_startoff; + __entry->len = irec->br_blockcount; + __entry->pblk = irec->br_startblock; + ), + TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->lblk, + __entry->len, + __entry->pblk) +); +#define DEFINE_INODE_IREC_EVENT(name) \ +DEFINE_EVENT(xfs_inode_irec_class, name, \ + TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), \ + TP_ARGS(ip, irec)) + +/* refcount/reflink tracepoint definitions */ + +/* reflink tracepoints */ +DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag); +DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag); +DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size); +DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap); +TRACE_EVENT(xfs_reflink_remap_blocks_loop, + TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset, 
+ xfs_filblks_t len, struct xfs_inode *dest, + xfs_fileoff_t doffset), + TP_ARGS(src, soffset, len, dest, doffset), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, src_ino) + __field(xfs_fileoff_t, src_lblk) + __field(xfs_filblks_t, len) + __field(xfs_ino_t, dest_ino) + __field(xfs_fileoff_t, dest_lblk) + ), + TP_fast_assign( + __entry->dev = VFS_I(src)->i_sb->s_dev; + __entry->src_ino = src->i_ino; + __entry->src_lblk = soffset; + __entry->len = len; + __entry->dest_ino = dest->i_ino; + __entry->dest_lblk = doffset; + ), + TP_printk("dev %d:%d len 0x%llx " + "ino 0x%llx offset 0x%llx blocks -> " + "ino 0x%llx offset 0x%llx blocks", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->len, + __entry->src_ino, + __entry->src_lblk, + __entry->dest_ino, + __entry->dest_lblk) +); +TRACE_EVENT(xfs_reflink_punch_range, + TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, + xfs_extlen_t len), + TP_ARGS(ip, lblk, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fileoff_t, lblk) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->lblk = lblk; + __entry->len = len; + ), + TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->lblk, + __entry->len) +); +TRACE_EVENT(xfs_reflink_remap, + TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, + xfs_extlen_t len, xfs_fsblock_t new_pblk), + TP_ARGS(ip, lblk, len, new_pblk), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fileoff_t, lblk) + __field(xfs_extlen_t, len) + __field(xfs_fsblock_t, new_pblk) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->lblk = lblk; + __entry->len = len; + __entry->new_pblk = new_pblk; + ), + TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x new_pblk %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->lblk, + __entry->len, + __entry->new_pblk) +); +DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_reflink_main_loop_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_read_iomap_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error); + +/* dedupe tracepoints */ +DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error); + +/* ioctl tracepoints */ +DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_reflink); +DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_clone_range); +DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_file_extent_same); +TRACE_EVENT(xfs_ioctl_clone, + TP_PROTO(struct inode *src, struct inode *dest), + TP_ARGS(src, dest), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, src_ino) + __field(loff_t, src_isize) + __field(unsigned long, dest_ino) + __field(loff_t, dest_isize) + ), + TP_fast_assign( + __entry->dev = src->i_sb->s_dev; + __entry->src_ino = src->i_ino; + __entry->src_isize = i_size_read(src); + __entry->dest_ino = dest->i_ino; + __entry->dest_isize = i_size_read(dest); + ), + TP_printk("dev %d:%d " + "ino 0x%lx isize 0x%llx -> " + "ino 0x%lx isize 0x%llx\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->src_ino, + __entry->src_isize, + __entry->dest_ino, + 
__entry->dest_isize) +); + +/* unshare tracepoints */ +DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare); +DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cow_eof_block); +DEFINE_PAGE_EVENT(xfs_reflink_unshare_page); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_cow_eof_block_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_dirty_page_error); + +/* copy on write */ +DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); + +DEFINE_RW_EVENT(xfs_reflink_reserve_cow_range); +DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range); + +DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write); +DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping); +DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec); + +DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); +DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_piece); + +DEFINE_INODE_ERROR_EVENT(xfs_reflink_reserve_cow_range_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); + +DEFINE_COW_EVENT(xfs_reflink_fork_buf); +DEFINE_COW_EVENT(xfs_reflink_finish_fork_buf); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_fork_buf_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_finish_fork_buf_error); + +DEFINE_INODE_EVENT(xfs_reflink_cancel_pending_cow); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_pending_cow_error); + +/* rmap swapext tracepoints */ +DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap); +DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece); +DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index e2bf86aad33d..61b7fbdd3ebd 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -36,6 +36,11 @@ struct xfs_busy_extent; struct xfs_rud_log_item; struct xfs_rui_log_item; struct xfs_btree_cur; +struct xfs_cui_log_item; +struct xfs_cud_log_item; +struct xfs_defer_ops; +struct xfs_bui_log_item; +struct xfs_bud_log_item; typedef struct xfs_log_item { struct list_head li_ail; /* AIL pointers */ @@ -248,4 +253,28 @@ int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp, xfs_fsblock_t startblock, xfs_filblks_t blockcount, xfs_exntst_t state, struct xfs_btree_cur **pcur); +/* refcount updates */ +enum xfs_refcount_intent_type; + +void xfs_refcount_update_init_defer_op(void); +struct xfs_cud_log_item *xfs_trans_get_cud(struct xfs_trans *tp, + struct xfs_cui_log_item *cuip); +int xfs_trans_log_finish_refcount_update(struct xfs_trans *tp, + struct xfs_cud_log_item *cudp, struct xfs_defer_ops *dfops, + enum xfs_refcount_intent_type type, xfs_fsblock_t startblock, + xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb, + xfs_extlen_t *new_len, struct xfs_btree_cur **pcur); + +/* mapping updates */ +enum xfs_bmap_intent_type; + +void xfs_bmap_update_init_defer_op(void); +struct xfs_bud_log_item *xfs_trans_get_bud(struct xfs_trans *tp, + struct xfs_bui_log_item *buip); +int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp, + struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops, + enum xfs_bmap_intent_type type, struct xfs_inode *ip, + int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, + xfs_filblks_t 
blockcount, xfs_exntst_t state);
+
 #endif	/* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
new file mode 100644
index 000000000000..6408e7d7c08c
--- /dev/null
+++ b/fs/xfs/xfs_trans_bmap.c
@@ -0,0 +1,249 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_bmap_item.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_inode.h"
+
+/*
+ * This routine is called to allocate a "bmap update done"
+ * log item.
+ */
+struct xfs_bud_log_item *
+xfs_trans_get_bud(
+	struct xfs_trans		*tp,
+	struct xfs_bui_log_item		*buip)
+{
+	struct xfs_bud_log_item		*budp;
+
+	budp = xfs_bud_init(tp->t_mountp, buip);
+	xfs_trans_add_item(tp, &budp->bud_item);
+	return budp;
+}
+
+/*
+ * Finish a bmap update and log it to the BUD.  Note that the
+ * transaction is marked dirty regardless of whether the bmap update
+ * succeeds or fails to support the BUI/BUD lifecycle rules.
+ */
+int
+xfs_trans_log_finish_bmap_update(
+	struct xfs_trans		*tp,
+	struct xfs_bud_log_item		*budp,
+	struct xfs_defer_ops		*dop,
+	enum xfs_bmap_intent_type	type,
+	struct xfs_inode		*ip,
+	int				whichfork,
+	xfs_fileoff_t			startoff,
+	xfs_fsblock_t			startblock,
+	xfs_filblks_t			blockcount,
+	xfs_exntst_t			state)
+{
+	int				error;
+
+	error = xfs_bmap_finish_one(tp, dop, ip, type, whichfork, startoff,
+			startblock, blockcount, state);
+
+	/*
+	 * Mark the transaction dirty, even on error. This ensures the
+	 * transaction is aborted, which:
+	 *
+	 * 1.) releases the BUI and frees the BUD
+	 * 2.) shuts down the filesystem
+	 */
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	budp->bud_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	return error;
+}
+
+/* Sort bmap intents by inode. */
+static int
+xfs_bmap_update_diff_items(
+	void				*priv,
+	struct list_head		*a,
+	struct list_head		*b)
+{
+	struct xfs_bmap_intent		*ba;
+	struct xfs_bmap_intent		*bb;
+
+	ba = container_of(a, struct xfs_bmap_intent, bi_list);
+	bb = container_of(b, struct xfs_bmap_intent, bi_list);
+	return ba->bi_owner->i_ino - bb->bi_owner->i_ino;
+}
+
+/* Get a BUI. */
+STATIC void *
+xfs_bmap_update_create_intent(
+	struct xfs_trans		*tp,
+	unsigned int			count)
+{
+	struct xfs_bui_log_item		*buip;
+
+	ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
+	ASSERT(tp != NULL);
+
+	buip = xfs_bui_init(tp->t_mountp);
+	ASSERT(buip != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	xfs_trans_add_item(tp, &buip->bui_item);
+	return buip;
+}
+
+/* Set the map extent flags for this mapping. */
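+/*
+ * The encoding, roughly: the intent type (XFS_BMAP_MAP or
+ * XFS_BMAP_UNMAP) occupies the low bits of me_flags, and the
+ * unwritten/attr-fork state is OR'd on top:
+ *
+ *	bmap->me_flags = XFS_BMAP_MAP | XFS_BMAP_EXTENT_UNWRITTEN;
+ *
+ * Log recovery masks the type bits back out to rebuild the intent.
+ */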
+static void
+xfs_trans_set_bmap_flags(
	struct xfs_map_extent		*bmap,
+	enum xfs_bmap_intent_type	type,
+	int				whichfork,
+	xfs_exntst_t			state)
+{
+	bmap->me_flags = 0;
+	switch (type) {
+	case XFS_BMAP_MAP:
+	case XFS_BMAP_UNMAP:
+		bmap->me_flags = type;
+		break;
+	default:
+		ASSERT(0);
+	}
+	if (state == XFS_EXT_UNWRITTEN)
+		bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN;
+	if (whichfork == XFS_ATTR_FORK)
+		bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK;
+}
+
+/* Log bmap updates in the intent item. */
+STATIC void
+xfs_bmap_update_log_item(
+	struct xfs_trans		*tp,
+	void				*intent,
+	struct list_head		*item)
+{
+	struct xfs_bui_log_item		*buip = intent;
+	struct xfs_bmap_intent		*bmap;
+	uint				next_extent;
+	struct xfs_map_extent		*map;
+
+	bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	buip->bui_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	/*
+	 * atomic_inc_return gives us the value after the increment;
+	 * we want to use it as an array index so we need to subtract 1 from
+	 * it.
+	 */
+	next_extent = atomic_inc_return(&buip->bui_next_extent) - 1;
+	ASSERT(next_extent < buip->bui_format.bui_nextents);
+	map = &buip->bui_format.bui_extents[next_extent];
+	map->me_owner = bmap->bi_owner->i_ino;
+	map->me_startblock = bmap->bi_bmap.br_startblock;
+	map->me_startoff = bmap->bi_bmap.br_startoff;
+	map->me_len = bmap->bi_bmap.br_blockcount;
+	xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork,
+			bmap->bi_bmap.br_state);
+}
+
+/* Get a BUD so we can process all the deferred bmap updates. */
+STATIC void *
+xfs_bmap_update_create_done(
+	struct xfs_trans		*tp,
+	void				*intent,
+	unsigned int			count)
+{
+	return xfs_trans_get_bud(tp, intent);
+}
+
+/* Process a deferred bmap update. */
+STATIC int
+xfs_bmap_update_finish_item(
+	struct xfs_trans		*tp,
+	struct xfs_defer_ops		*dop,
+	struct list_head		*item,
+	void				*done_item,
+	void				**state)
+{
+	struct xfs_bmap_intent		*bmap;
+	int				error;
+
+	bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+	error = xfs_trans_log_finish_bmap_update(tp, done_item, dop,
+			bmap->bi_type,
+			bmap->bi_owner, bmap->bi_whichfork,
+			bmap->bi_bmap.br_startoff,
+			bmap->bi_bmap.br_startblock,
+			bmap->bi_bmap.br_blockcount,
+			bmap->bi_bmap.br_state);
+	kmem_free(bmap);
+	return error;
+}
+
+/* Abort all pending BUIs. */
+STATIC void
+xfs_bmap_update_abort_intent(
+	void				*intent)
+{
+	xfs_bui_release(intent);
+}
+
+/* Cancel a deferred bmap update. */
+STATIC void
+xfs_bmap_update_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_bmap_intent		*bmap;
+
+	bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+	kmem_free(bmap);
+}
+
+static const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
+	.type		= XFS_DEFER_OPS_TYPE_BMAP,
+	.max_items	= XFS_BUI_MAX_FAST_EXTENTS,
+	.diff_items	= xfs_bmap_update_diff_items,
+	.create_intent	= xfs_bmap_update_create_intent,
+	.abort_intent	= xfs_bmap_update_abort_intent,
+	.log_item	= xfs_bmap_update_log_item,
+	.create_done	= xfs_bmap_update_create_done,
+	.finish_item	= xfs_bmap_update_finish_item,
+	.cancel_item	= xfs_bmap_update_cancel_item,
+};
+
+/* Register the deferred op type. */
+void
+xfs_bmap_update_init_defer_op(void)
+{
+	xfs_defer_init_op_type(&xfs_bmap_update_defer_type);
+}
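+/*
+ * Usage, as a hedged sketch (the exact call sites live elsewhere in
+ * this series): a caller fills out a struct xfs_bmap_intent, queues it
+ * on the transaction's deferred-ops list, and the framework logs the
+ * BUI, rolls the transaction, and invokes ->finish_item on each
+ * queued mapping:
+ *
+ *	bi->bi_type = XFS_BMAP_MAP;
+ *	xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list);
+ *	...
+ *	error = xfs_defer_finish(&tp, dfops, ip);
+ */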
diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c
new file mode 100644
index 000000000000..94c1877af834
--- /dev/null
+++ b/fs/xfs/xfs_trans_refcount.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_refcount_item.h"
+#include "xfs_alloc.h"
+#include "xfs_refcount.h"
+
+/*
+ * This routine is called to allocate a "refcount update done"
+ * log item.
+ */
+struct xfs_cud_log_item *
+xfs_trans_get_cud(
+	struct xfs_trans		*tp,
+	struct xfs_cui_log_item		*cuip)
+{
+	struct xfs_cud_log_item		*cudp;
+
+	cudp = xfs_cud_init(tp->t_mountp, cuip);
+	xfs_trans_add_item(tp, &cudp->cud_item);
+	return cudp;
+}
+
+/*
+ * Finish a refcount update and log it to the CUD.  Note that the
+ * transaction is marked dirty regardless of whether the refcount
+ * update succeeds or fails to support the CUI/CUD lifecycle rules.
+ */
+int
+xfs_trans_log_finish_refcount_update(
+	struct xfs_trans		*tp,
+	struct xfs_cud_log_item		*cudp,
+	struct xfs_defer_ops		*dop,
+	enum xfs_refcount_intent_type	type,
+	xfs_fsblock_t			startblock,
+	xfs_extlen_t			blockcount,
+	xfs_fsblock_t			*new_fsb,
+	xfs_extlen_t			*new_len,
+	struct xfs_btree_cur		**pcur)
+{
+	int				error;
+
+	error = xfs_refcount_finish_one(tp, dop, type, startblock,
+			blockcount, new_fsb, new_len, pcur);
+
+	/*
+	 * Mark the transaction dirty, even on error. This ensures the
+	 * transaction is aborted, which:
+	 *
+	 * 1.) releases the CUI and frees the CUD
+	 * 2.) shuts down the filesystem
+	 */
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	cudp->cud_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	return error;
+}
+
+/* Sort refcount intents by AG. */
+static int
+xfs_refcount_update_diff_items(
+	void				*priv,
+	struct list_head		*a,
+	struct list_head		*b)
+{
+	struct xfs_mount		*mp = priv;
+	struct xfs_refcount_intent	*ra;
+	struct xfs_refcount_intent	*rb;
+
+	ra = container_of(a, struct xfs_refcount_intent, ri_list);
+	rb = container_of(b, struct xfs_refcount_intent, ri_list);
+	return XFS_FSB_TO_AGNO(mp, ra->ri_startblock) -
+		XFS_FSB_TO_AGNO(mp, rb->ri_startblock);
+}
+
+/* Get a CUI. */
+STATIC void *
+xfs_refcount_update_create_intent(
+	struct xfs_trans		*tp,
+	unsigned int			count)
+{
+	struct xfs_cui_log_item		*cuip;
+
+	ASSERT(tp != NULL);
+	ASSERT(count > 0);
+
+	cuip = xfs_cui_init(tp->t_mountp, count);
+	ASSERT(cuip != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	xfs_trans_add_item(tp, &cuip->cui_item);
+	return cuip;
+}
+
+/* Set the phys extent flags for this refcount update. */
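+/*
+ * Unlike the bmap case there is no extra state to encode; the intent
+ * type is stored directly in the low bits of pe_flags, roughly:
+ *
+ *	refc->pe_flags |= XFS_REFCOUNT_INCREASE;
+ *
+ * Only the four refcount intent types are valid here, so anything
+ * else trips the ASSERT below.
+ */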
+
+/* Set the phys extent flags for this refcount update. */
+static void
+xfs_trans_set_refcount_flags(
+	struct xfs_phys_extent		*refc,
+	enum xfs_refcount_intent_type	type)
+{
+	refc->pe_flags = 0;
+	switch (type) {
+	case XFS_REFCOUNT_INCREASE:
+	case XFS_REFCOUNT_DECREASE:
+	case XFS_REFCOUNT_ALLOC_COW:
+	case XFS_REFCOUNT_FREE_COW:
+		refc->pe_flags |= type;
+		break;
+	default:
+		ASSERT(0);
+	}
+}
+
+/* Log refcount updates in the intent item. */
+STATIC void
+xfs_refcount_update_log_item(
+	struct xfs_trans		*tp,
+	void				*intent,
+	struct list_head		*item)
+{
+	struct xfs_cui_log_item		*cuip = intent;
+	struct xfs_refcount_intent	*refc;
+	uint				next_extent;
+	struct xfs_phys_extent		*ext;
+
+	refc = container_of(item, struct xfs_refcount_intent, ri_list);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	cuip->cui_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	/*
+	 * atomic_inc_return gives us the value after the increment;
+	 * we want to use it as an array index so we need to subtract 1 from
+	 * it.
+	 */
+	next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1;
+	ASSERT(next_extent < cuip->cui_format.cui_nextents);
+	ext = &cuip->cui_format.cui_extents[next_extent];
+	ext->pe_startblock = refc->ri_startblock;
+	ext->pe_len = refc->ri_blockcount;
+	xfs_trans_set_refcount_flags(ext, refc->ri_type);
+}
+
+/* Get a CUD so we can process all the deferred refcount updates. */
+STATIC void *
+xfs_refcount_update_create_done(
+	struct xfs_trans		*tp,
+	void				*intent,
+	unsigned int			count)
+{
+	return xfs_trans_get_cud(tp, intent);
+}
+
+/* Process a deferred refcount update. */
+STATIC int
+xfs_refcount_update_finish_item(
+	struct xfs_trans		*tp,
+	struct xfs_defer_ops		*dop,
+	struct list_head		*item,
+	void				*done_item,
+	void				**state)
+{
+	struct xfs_refcount_intent	*refc;
+	xfs_fsblock_t			new_fsb;
+	xfs_extlen_t			new_aglen;
+	int				error;
+
+	refc = container_of(item, struct xfs_refcount_intent, ri_list);
+	error = xfs_trans_log_finish_refcount_update(tp, done_item, dop,
+			refc->ri_type,
+			refc->ri_startblock,
+			refc->ri_blockcount,
+			&new_fsb, &new_aglen,
+			(struct xfs_btree_cur **)state);
+	/* Did we run out of reservation?  Requeue what we didn't finish. */
+	if (!error && new_aglen > 0) {
+		ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE ||
+		       refc->ri_type == XFS_REFCOUNT_DECREASE);
+		refc->ri_startblock = new_fsb;
+		refc->ri_blockcount = new_aglen;
+		return -EAGAIN;
+	}
+	kmem_free(refc);
+	return error;
+}
+
+/* Clean up after processing deferred refcounts. */
+STATIC void
+xfs_refcount_update_finish_cleanup(
+	struct xfs_trans	*tp,
+	void			*state,
+	int			error)
+{
+	struct xfs_btree_cur	*rcur = state;
+
+	xfs_refcount_finish_one_cleanup(tp, rcur, error);
+}
+
+/* Abort all pending CUIs. */
+STATIC void
+xfs_refcount_update_abort_intent(
+	void				*intent)
+{
+	xfs_cui_release(intent);
+}
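The -EAGAIN return in ->finish_item above is a contract with the defer machinery: the intent has been rewritten in place to cover only the unfinished tail, and the caller is expected to keep it on the work list and roll the transaction for a fresh reservation before retrying. A simplified illustration of that contract (the example_* wrapper is hypothetical; the real loop lives in xfs_defer_finish()):

/*
 * Hedged sketch, not part of this patch: interpret the return value
 * of ->finish_item for a refcount intent.
 */
static int
example_finish_one_refcount(
	struct xfs_trans		*tp,
	struct xfs_defer_ops		*dop,
	struct list_head		*item,
	void				*done,
	void				**state)
{
	int				error;

	error = xfs_refcount_update_finish_item(tp, dop, item, done, state);
	if (error == -EAGAIN) {
		/*
		 * The intent now covers only the unfinished remainder;
		 * leave it queued and roll the transaction so the next
		 * pass can pick it up with a fresh reservation.
		 */
		return 0;
	}
	return error;
}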
+
+/* Cancel a deferred refcount update. */
+STATIC void
+xfs_refcount_update_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_refcount_intent	*refc;
+
+	refc = container_of(item, struct xfs_refcount_intent, ri_list);
+	kmem_free(refc);
+}
+
+static const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
+	.type		= XFS_DEFER_OPS_TYPE_REFCOUNT,
+	.max_items	= XFS_CUI_MAX_FAST_EXTENTS,
+	.diff_items	= xfs_refcount_update_diff_items,
+	.create_intent	= xfs_refcount_update_create_intent,
+	.abort_intent	= xfs_refcount_update_abort_intent,
+	.log_item	= xfs_refcount_update_log_item,
+	.create_done	= xfs_refcount_update_create_done,
+	.finish_item	= xfs_refcount_update_finish_item,
+	.finish_cleanup	= xfs_refcount_update_finish_cleanup,
+	.cancel_item	= xfs_refcount_update_cancel_item,
+};
+
+/* Register the deferred op type. */
+void
+xfs_refcount_update_init_defer_op(void)
+{
+	xfs_defer_init_op_type(&xfs_refcount_update_defer_type);
+}
diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c
index 5a50ef881568..9ead064b5e90 100644
--- a/fs/xfs/xfs_trans_rmap.c
+++ b/fs/xfs/xfs_trans_rmap.c
@@ -48,12 +48,21 @@ xfs_trans_set_rmap_flags(
 	case XFS_RMAP_MAP:
 		rmap->me_flags |= XFS_RMAP_EXTENT_MAP;
 		break;
+	case XFS_RMAP_MAP_SHARED:
+		rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED;
+		break;
 	case XFS_RMAP_UNMAP:
 		rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP;
 		break;
+	case XFS_RMAP_UNMAP_SHARED:
+		rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED;
+		break;
 	case XFS_RMAP_CONVERT:
 		rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT;
 		break;
+	case XFS_RMAP_CONVERT_SHARED:
+		rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED;
+		break;
 	case XFS_RMAP_ALLOC:
 		rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC;
 		break;
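These flag encodings are consumed again at log recovery time, where the on-disk intent must be decoded back into an rmap intent type before it can be replayed. A hedged sketch of that inverse mapping, covering only the cases visible in the hunk above and assuming the XFS_RMAP_EXTENT_TYPE_MASK definition from xfs_log_format.h; the example_* helper is hypothetical:

/*
 * Hedged sketch, not part of this patch: decode the type bits of a
 * logged rmap extent back into an intent type, as recovery would.
 */
static int
example_rmap_type_from_flags(
	uint32_t			flags,
	enum xfs_rmap_intent_type	*type)
{
	switch (flags & XFS_RMAP_EXTENT_TYPE_MASK) {
	case XFS_RMAP_EXTENT_MAP:
		*type = XFS_RMAP_MAP;
		break;
	case XFS_RMAP_EXTENT_MAP_SHARED:
		*type = XFS_RMAP_MAP_SHARED;
		break;
	case XFS_RMAP_EXTENT_UNMAP:
		*type = XFS_RMAP_UNMAP;
		break;
	case XFS_RMAP_EXTENT_UNMAP_SHARED:
		*type = XFS_RMAP_UNMAP_SHARED;
		break;
	case XFS_RMAP_EXTENT_CONVERT:
		*type = XFS_RMAP_CONVERT;
		break;
	case XFS_RMAP_EXTENT_CONVERT_SHARED:
		*type = XFS_RMAP_CONVERT_SHARED;
		break;
	case XFS_RMAP_EXTENT_ALLOC:
		*type = XFS_RMAP_ALLOC;
		break;
	default:
		/* Unknown type bits: treat the intent as corrupt. */
		return -EFSCORRUPTED;
	}
	return 0;
}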