diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-12-18 05:44:00 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-12-18 05:44:00 +0300 |
commit | 0110c350c86d511be2130cb2a30dcbb76c4af750 (patch) | |
tree | d343a9e0fcb586a7110b13d411b314d33d404c08 /fs | |
parent | d9cb5bfcc3339f1a63df8fe0af8cece33c83c3af (diff) | |
parent | 9763f7a4a5f7b1a7c480fa06d01b2bad25163c0a (diff) | |
download | linux-0110c350c86d511be2130cb2a30dcbb76c4af750.tar.xz |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull more vfs updates from Al Viro:
"In this pile:
- autofs-namespace series
- dedupe stuff
- more struct path constification"
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (40 commits)
ocfs2: implement the VFS clone_range, copy_range, and dedupe_range features
ocfs2: charge quota for reflinked blocks
ocfs2: fix bad pointer cast
ocfs2: always unlock when completing dio writes
ocfs2: don't eat io errors during _dio_end_io_write
ocfs2: budget for extent tree splits when adding refcount flag
ocfs2: prohibit refcounted swapfiles
ocfs2: add newlines to some error messages
ocfs2: convert inode refcount test to a helper
simple_write_end(): don't zero in short copy into uptodate
exofs: don't mess with simple_write_{begin,end}
9p: saner ->write_end() on failing copy into non-uptodate page
fix gfs2_stuffed_write_end() on short copies
fix ceph_write_end()
nfs_write_end(): fix handling of short copies
vfs: refactor clone/dedupe_file_range common functions
fs: try to clone files first in vfs_copy_file_range
vfs: misc struct path constification
namespace.c: constify struct path passed to a bunch of primitives
quota: constify struct path in quota_on
...
Diffstat (limited to 'fs')
45 files changed, 978 insertions, 468 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 5ca1fb0043f6..adaf6f6dd858 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -310,18 +310,10 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); - if (unlikely(copied < len)) { - /* - * zero out the rest of the area - */ - unsigned from = pos & (PAGE_SIZE - 1); - - zero_user(page, from + copied, len - copied); - flush_dcache_page(page); + if (unlikely(copied < len && !PageUptodate(page))) { + copied = 0; + goto out; } - - if (!PageUptodate(page)) - SetPageUptodate(page); /* * No need to use i_size_read() here, the i_size * cannot change under us because we hold the i_mutex. @@ -331,6 +323,7 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, i_size_write(inode, last_pos); } set_page_dirty(page); +out: unlock_page(page); put_page(page); diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index a1fba4285277..c885daae68c8 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -145,7 +145,7 @@ void autofs4_free_ino(struct autofs_info *); /* Expiration */ int is_autofs4_dentry(struct dentry *); -int autofs4_expire_wait(struct dentry *dentry, int rcu_walk); +int autofs4_expire_wait(const struct path *path, int rcu_walk); int autofs4_expire_run(struct super_block *, struct vfsmount *, struct autofs_sb_info *, struct autofs_packet_expire __user *); @@ -217,7 +217,8 @@ static inline int autofs_prepare_pipe(struct file *pipe) /* Queue management functions */ -int autofs4_wait(struct autofs_sb_info *, struct dentry *, enum autofs_notify); +int autofs4_wait(struct autofs_sb_info *, + const struct path *, enum autofs_notify); int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int); void autofs4_catatonic_mode(struct autofs_sb_info *); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index fc09eb77ddf3..6f48d670c941 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -204,7 +204,7 @@ static int autofs_dev_ioctl_protosubver(struct file *fp, /* Find the topmost mount satisfying test() */ static int find_autofs_mount(const char *pathname, struct path *res, - int test(struct path *path, void *data), + int test(const struct path *path, void *data), void *data) { struct path path; @@ -230,12 +230,12 @@ static int find_autofs_mount(const char *pathname, return err; } -static int test_by_dev(struct path *path, void *p) +static int test_by_dev(const struct path *path, void *p) { return path->dentry->d_sb->s_dev == *(dev_t *)p; } -static int test_by_type(struct path *path, void *p) +static int test_by_type(const struct path *path, void *p) { struct autofs_info *ino = autofs4_dentry_ino(path->dentry); @@ -468,7 +468,7 @@ static int autofs_dev_ioctl_requester(struct file *fp, ino = autofs4_dentry_ino(path.dentry); if (ino) { err = 0; - autofs4_expire_wait(path.dentry, 0); + autofs4_expire_wait(&path, 0); spin_lock(&sbi->fs_lock); param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); @@ -575,7 +575,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, devid = new_encode_dev(dev); - err = have_submounts(path.dentry); + err = path_has_submounts(&path); if (follow_down_one(&path)) magic = path.dentry->d_sb->s_magic; diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index d8e6d421c27f..57725d4a8c59 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -310,26 +310,29 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, now = jiffies; timeout = sbi->exp_timeout; - spin_lock(&sbi->fs_lock); - ino = autofs4_dentry_ino(root); - /* No point expiring a pending mount */ - if (ino->flags & AUTOFS_INF_PENDING) - goto out; if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { + spin_lock(&sbi->fs_lock); + ino = autofs4_dentry_ino(root); + /* No point expiring a pending mount */ + if (ino->flags & AUTOFS_INF_PENDING) { + spin_unlock(&sbi->fs_lock); + goto out; + } ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); - spin_lock(&sbi->fs_lock); if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { + spin_lock(&sbi->fs_lock); ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); return root; } + spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_WANT_EXPIRE; + spin_unlock(&sbi->fs_lock); } out: - spin_unlock(&sbi->fs_lock); dput(root); return NULL; @@ -495,8 +498,9 @@ found: return expired; } -int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) +int autofs4_expire_wait(const struct path *path, int rcu_walk) { + struct dentry *dentry = path->dentry; struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; @@ -525,7 +529,7 @@ retry: pr_debug("waiting for expire %p name=%pd\n", dentry, dentry); - status = autofs4_wait(sbi, dentry, NFY_NONE); + status = autofs4_wait(sbi, path, NFY_NONE); wait_for_completion(&ino->expire_complete); pr_debug("expire done status=%d\n", status); @@ -592,11 +596,12 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, if (dentry) { struct autofs_info *ino = autofs4_dentry_ino(dentry); + const struct path path = { .mnt = mnt, .dentry = dentry }; /* This is synchronous because it makes the daemon a * little easier */ - ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); + ret = autofs4_wait(sbi, &path, NFY_EXPIRE); spin_lock(&sbi->fs_lock); /* avoid rapid-fire expire attempts if expiry fails */ diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index a11f73174877..82e8f6edfb48 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -32,7 +32,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file); static struct dentry *autofs4_lookup(struct inode *, struct dentry *, unsigned int); static struct vfsmount *autofs4_d_automount(struct path *); -static int autofs4_d_manage(struct dentry *, bool); +static int autofs4_d_manage(const struct path *, bool); static void autofs4_dentry_release(struct dentry *); const struct file_operations autofs4_root_operations = { @@ -123,7 +123,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) * it. */ spin_lock(&sbi->lookup_lock); - if (!d_mountpoint(dentry) && simple_empty(dentry)) { + if (!path_is_mountpoint(&file->f_path) && simple_empty(dentry)) { spin_unlock(&sbi->lookup_lock); return -ENOENT; } @@ -269,39 +269,41 @@ next: return NULL; } -static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk) +static int autofs4_mount_wait(const struct path *path, bool rcu_walk) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs4_sbi(path->dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(path->dentry); int status = 0; if (ino->flags & AUTOFS_INF_PENDING) { if (rcu_walk) return -ECHILD; - pr_debug("waiting for mount name=%pd\n", dentry); - status = autofs4_wait(sbi, dentry, NFY_MOUNT); + pr_debug("waiting for mount name=%pd\n", path->dentry); + status = autofs4_wait(sbi, path, NFY_MOUNT); pr_debug("mount wait done status=%d\n", status); } ino->last_used = jiffies; return status; } -static int do_expire_wait(struct dentry *dentry, bool rcu_walk) +static int do_expire_wait(const struct path *path, bool rcu_walk) { + struct dentry *dentry = path->dentry; struct dentry *expiring; expiring = autofs4_lookup_expiring(dentry, rcu_walk); if (IS_ERR(expiring)) return PTR_ERR(expiring); if (!expiring) - return autofs4_expire_wait(dentry, rcu_walk); + return autofs4_expire_wait(path, rcu_walk); else { + const struct path this = { .mnt = path->mnt, .dentry = expiring }; /* * If we are racing with expire the request might not * be quite complete, but the directory has been removed * so it must have been successful, just wait for it. */ - autofs4_expire_wait(expiring, 0); + autofs4_expire_wait(&this, 0); autofs4_del_expiring(expiring); dput(expiring); } @@ -354,7 +356,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) * and the directory was removed, so just go ahead and try * the mount. */ - status = do_expire_wait(dentry, 0); + status = do_expire_wait(path, 0); if (status && status != -EAGAIN) return NULL; @@ -362,7 +364,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) spin_lock(&sbi->fs_lock); if (ino->flags & AUTOFS_INF_PENDING) { spin_unlock(&sbi->fs_lock); - status = autofs4_mount_wait(dentry, 0); + status = autofs4_mount_wait(path, 0); if (status) return ERR_PTR(status); goto done; @@ -370,28 +372,28 @@ static struct vfsmount *autofs4_d_automount(struct path *path) /* * If the dentry is a symlink it's equivalent to a directory - * having d_mountpoint() true, so there's no need to call back - * to the daemon. + * having path_is_mountpoint() true, so there's no need to call + * back to the daemon. */ if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { spin_unlock(&sbi->fs_lock); goto done; } - if (!d_mountpoint(dentry)) { + if (!path_is_mountpoint(path)) { /* * It's possible that user space hasn't removed directories * after umounting a rootless multi-mount, although it - * should. For v5 have_submounts() is sufficient to handle - * this because the leaves of the directory tree under the - * mount never trigger mounts themselves (they have an autofs - * trigger mount mounted on them). But v4 pseudo direct mounts - * do need the leaves to trigger mounts. In this case we - * have no choice but to use the list_empty() check and + * should. For v5 path_has_submounts() is sufficient to + * handle this because the leaves of the directory tree under + * the mount never trigger mounts themselves (they have an + * autofs trigger mount mounted on them). But v4 pseudo direct + * mounts do need the leaves to trigger mounts. In this case + * we have no choice but to use the list_empty() check and * require user space behave. */ if (sbi->version > 4) { - if (have_submounts(dentry)) { + if (path_has_submounts(path)) { spin_unlock(&sbi->fs_lock); goto done; } @@ -403,7 +405,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) } ino->flags |= AUTOFS_INF_PENDING; spin_unlock(&sbi->fs_lock); - status = autofs4_mount_wait(dentry, 0); + status = autofs4_mount_wait(path, 0); spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_PENDING; if (status) { @@ -421,8 +423,9 @@ done: return NULL; } -static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) +static int autofs4_d_manage(const struct path *path, bool rcu_walk) { + struct dentry *dentry = path->dentry; struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; @@ -431,20 +434,20 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) /* The daemon never waits. */ if (autofs4_oz_mode(sbi)) { - if (!d_mountpoint(dentry)) + if (!path_is_mountpoint(path)) return -EISDIR; return 0; } /* Wait for pending expires */ - if (do_expire_wait(dentry, rcu_walk) == -ECHILD) + if (do_expire_wait(path, rcu_walk) == -ECHILD) return -ECHILD; /* * This dentry may be under construction so wait on mount * completion. */ - status = autofs4_mount_wait(dentry, rcu_walk); + status = autofs4_mount_wait(path, rcu_walk); if (status) return status; @@ -460,7 +463,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) if (ino->flags & AUTOFS_INF_WANT_EXPIRE) return 0; - if (d_mountpoint(dentry)) + if (path_is_mountpoint(path)) return 0; inode = d_inode_rcu(dentry); if (inode && S_ISLNK(inode->i_mode)) @@ -487,7 +490,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) * we can avoid needless calls ->d_automount() and avoid * an incorrect ELOOP error return. */ - if ((!d_mountpoint(dentry) && !simple_empty(dentry)) || + if ((!path_is_mountpoint(path) && !simple_empty(dentry)) || (d_really_is_positive(dentry) && d_is_symlink(dentry))) status = -EISDIR; } diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index e44271dfceb6..1278335ce366 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -250,8 +250,9 @@ autofs4_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr) static int validate_request(struct autofs_wait_queue **wait, struct autofs_sb_info *sbi, const struct qstr *qstr, - struct dentry *dentry, enum autofs_notify notify) + const struct path *path, enum autofs_notify notify) { + struct dentry *dentry = path->dentry; struct autofs_wait_queue *wq; struct autofs_info *ino; @@ -314,6 +315,7 @@ static int validate_request(struct autofs_wait_queue **wait, */ if (notify == NFY_MOUNT) { struct dentry *new = NULL; + struct path this; int valid = 1; /* @@ -333,7 +335,9 @@ static int validate_request(struct autofs_wait_queue **wait, dentry = new; } } - if (have_submounts(dentry)) + this.mnt = path->mnt; + this.dentry = dentry; + if (path_has_submounts(&this)) valid = 0; if (new) @@ -345,8 +349,9 @@ static int validate_request(struct autofs_wait_queue **wait, } int autofs4_wait(struct autofs_sb_info *sbi, - struct dentry *dentry, enum autofs_notify notify) + const struct path *path, enum autofs_notify notify) { + struct dentry *dentry = path->dentry; struct autofs_wait_queue *wq; struct qstr qstr; char *name; @@ -405,7 +410,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, return -EINTR; } - ret = validate_request(&wq, sbi, &qstr, dentry, notify); + ret = validate_request(&wq, sbi, &qstr, path, notify); if (ret <= 0) { if (ret != -EINTR) mutex_unlock(&sbi->wq_mutex); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 50bcfb80d33a..6a823719b6c5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3232,9 +3232,6 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); -ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - size_t len, unsigned int flags); int btrfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 448f57d108d1..b5c5da215d05 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3033,7 +3033,6 @@ const struct file_operations btrfs_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_compat_ioctl, #endif - .copy_file_range = btrfs_copy_file_range, .clone_file_range = btrfs_clone_file_range, .dedupe_file_range = btrfs_dedupe_file_range, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0a6902555e65..33f967d30b2a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -834,7 +834,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) * sys_mkdirat and vfs_mkdir, but we only do a single component lookup * inside this filesystem so it's quite a bit simpler. */ -static noinline int btrfs_mksubvol(struct path *parent, +static noinline int btrfs_mksubvol(const struct path *parent, char *name, int namelen, struct btrfs_root *snap_src, u64 *async_transid, bool readonly, @@ -3987,18 +3987,6 @@ out_unlock: return ret; } -ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - size_t len, unsigned int flags) -{ - ssize_t ret; - - ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out); - if (ret == 0) - ret = len; - return ret; -} - int btrfs_clone_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, u64 len) { diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index a0f1e2b91c8e..9cd0c0ea7cdb 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1317,25 +1317,27 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = file_inode(file); - unsigned from = pos & (PAGE_SIZE - 1); int check_cap = 0; dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, inode, page, (int)pos, (int)copied, (int)len); /* zero the stale part of the page if we did a short copy */ - if (copied < len) - zero_user_segment(page, from+copied, len); + if (!PageUptodate(page)) { + if (copied < len) { + copied = 0; + goto out; + } + SetPageUptodate(page); + } /* did file size increase? */ if (pos+copied > i_size_read(inode)) check_cap = ceph_inode_set_size(inode, pos+copied); - if (!PageUptodate(page)) - SetPageUptodate(page); - set_page_dirty(page); +out: unlock_page(page); put_page(page); diff --git a/fs/dcache.c b/fs/dcache.c index 5c7cc953ac81..252378359a8f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1273,38 +1273,44 @@ rename_retry: goto again; } -/* - * Search for at least 1 mount point in the dentry's subdirs. - * We descend to the next level whenever the d_subdirs - * list is non-empty and continue searching. - */ +struct check_mount { + struct vfsmount *mnt; + unsigned int mounted; +}; -static enum d_walk_ret check_mount(void *data, struct dentry *dentry) +static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry) { - int *ret = data; - if (d_mountpoint(dentry)) { - *ret = 1; + struct check_mount *info = data; + struct path path = { .mnt = info->mnt, .dentry = dentry }; + + if (likely(!d_mountpoint(dentry))) + return D_WALK_CONTINUE; + if (__path_is_mountpoint(&path)) { + info->mounted = 1; return D_WALK_QUIT; } return D_WALK_CONTINUE; } /** - * have_submounts - check for mounts over a dentry - * @parent: dentry to check. + * path_has_submounts - check for mounts over a dentry in the + * current namespace. + * @parent: path to check. * * Return true if the parent or its subdirectories contain - * a mount point + * a mount point in the current namespace. */ -int have_submounts(struct dentry *parent) +int path_has_submounts(const struct path *parent) { - int ret = 0; + struct check_mount data = { .mnt = parent->mnt, .mounted = 0 }; - d_walk(parent, &ret, check_mount, NULL); + read_seqlock_excl(&mount_lock); + d_walk(parent->dentry, &data, path_check_mount, NULL); + read_sequnlock_excl(&mount_lock); - return ret; + return data.mounted; } -EXPORT_SYMBOL(have_submounts); +EXPORT_SYMBOL(path_has_submounts); /* * Called by mount code to set a mountpoint and check if the mountpoint is diff --git a/fs/dcookies.c b/fs/dcookies.c index ac44a69fbea9..a26a701ef512 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -90,7 +90,7 @@ static void hash_dcookie(struct dcookie_struct * dcs) } -static struct dcookie_struct *alloc_dcookie(struct path *path) +static struct dcookie_struct *alloc_dcookie(const struct path *path) { struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache, GFP_KERNEL); @@ -113,7 +113,7 @@ static struct dcookie_struct *alloc_dcookie(struct path *path) /* This is the main kernel-side routine that retrieves the cookie * value for a dentry/vfsmnt pair. */ -int get_dcookie(struct path *path, unsigned long *cookie) +int get_dcookie(const struct path *path, unsigned long *cookie) { int err = 0; struct dcookie_struct * dcs; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index d8072bc074a4..0ac62811b341 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -870,46 +870,31 @@ int exofs_write_begin(struct file *file, struct address_space *mapping, page = *pagep; if (page == NULL) { - ret = simple_write_begin(file, mapping, pos, len, flags, pagep, - fsdata); - if (ret) { - EXOFS_DBGMSG("simple_write_begin failed\n"); - goto out; + page = grab_cache_page_write_begin(mapping, pos >> PAGE_SHIFT, + flags); + if (!page) { + EXOFS_DBGMSG("grab_cache_page_write_begin failed\n"); + return -ENOMEM; } - - page = *pagep; + *pagep = page; } /* read modify write */ if (!PageUptodate(page) && (len != PAGE_SIZE)) { loff_t i_size = i_size_read(mapping->host); pgoff_t end_index = i_size >> PAGE_SHIFT; - size_t rlen; - if (page->index < end_index) - rlen = PAGE_SIZE; - else if (page->index == end_index) - rlen = i_size & ~PAGE_MASK; - else - rlen = 0; - - if (!rlen) { + if (page->index > end_index) { clear_highpage(page); SetPageUptodate(page); - goto out; - } - - ret = _readpage(page, true); - if (ret) { - /*SetPageError was done by _readpage. Is it ok?*/ - unlock_page(page); - EXOFS_DBGMSG("__readpage failed\n"); + } else { + ret = _readpage(page, true); + if (ret) { + unlock_page(page); + EXOFS_DBGMSG("__readpage failed\n"); + } } } -out: - if (unlikely(ret)) - _write_failed(mapping->host, pos + len); - return ret; } @@ -929,18 +914,25 @@ static int exofs_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = mapping->host; - /* According to comment in simple_write_end i_mutex is held */ - loff_t i_size = inode->i_size; - int ret; - - ret = simple_write_end(file, mapping,pos, len, copied, page, fsdata); - if (unlikely(ret)) - _write_failed(inode, pos + len); + loff_t last_pos = pos + copied; - /* TODO: once simple_write_end marks inode dirty remove */ - if (i_size != inode->i_size) + if (!PageUptodate(page)) { + if (copied < len) { + _write_failed(inode, pos + len); + copied = 0; + goto out; + } + SetPageUptodate(page); + } + if (last_pos > inode->i_size) { + i_size_write(inode, last_pos); mark_inode_dirty(inode); - return ret; + } + set_page_dirty(page); +out: + unlock_page(page); + put_page(page); + return copied; } static int exofs_releasepage(struct page *page, gfp_t gfp) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dfc8309d7755..63a6b6332682 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1205,7 +1205,7 @@ static int ext4_release_dquot(struct dquot *dquot); static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, - struct path *path); + const struct path *path); static int ext4_quota_off(struct super_block *sb, int type); static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, @@ -5293,7 +5293,7 @@ static void lockdep_set_quota_inode(struct inode *inode, int subclass) * Standard function to be called on quota_on */ static int ext4_quota_on(struct super_block *sb, int type, int format_id, - struct path *path) + const struct path *path) { int err; diff --git a/fs/file_table.c b/fs/file_table.c index ad17e05ebf95..6d982b57de92 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -155,7 +155,7 @@ over: * @mode: the mode with which the new file will be opened * @fop: the 'struct file_operations' for the new file */ -struct file *alloc_file(struct path *path, fmode_t mode, +struct file *alloc_file(const struct path *path, fmode_t mode, const struct file_operations *fop) { struct file *file; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 5a6f52ea2722..6b039d7ce160 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -839,12 +839,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); kaddr = kmap_atomic(page); memcpy(buf + pos, kaddr + pos, copied); - memset(kaddr + pos + copied, 0, len - copied); flush_dcache_page(page); kunmap_atomic(kaddr); - if (!PageUptodate(page)) - SetPageUptodate(page); + WARN_ON(!PageUptodate(page)); unlock_page(page); put_page(page); diff --git a/fs/internal.h b/fs/internal.h index 4fcf51766d4a..b63cf3af2dc2 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -62,7 +62,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, extern void *copy_mount_options(const void __user *); extern char *copy_mount_string(const void __user *); -extern struct vfsmount *lookup_mnt(struct path *); +extern struct vfsmount *lookup_mnt(const struct path *); extern int finish_automount(struct vfsmount *, struct path *); extern int sb_prepare_remount_readonly(struct super_block *); diff --git a/fs/libfs.c b/fs/libfs.c index 48826d4da189..76048705d922 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -465,6 +465,8 @@ EXPORT_SYMBOL(simple_write_begin); * is not called, so a filesystem that actually does store data in .write_inode * should extend on what's done here with a call to mark_inode_dirty() in the * case that i_size has changed. + * + * Use *ONLY* with simple_readpage() */ int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, @@ -474,14 +476,14 @@ int simple_write_end(struct file *file, struct address_space *mapping, loff_t last_pos = pos + copied; /* zero the stale part of the page if we did a short copy */ - if (copied < len) { - unsigned from = pos & (PAGE_SIZE - 1); - - zero_user(page, from + copied, len - copied); - } + if (!PageUptodate(page)) { + if (copied < len) { + unsigned from = pos & (PAGE_SIZE - 1); - if (!PageUptodate(page)) + zero_user(page, from + copied, len - copied); + } SetPageUptodate(page); + } /* * No need to use i_size_read() here, the i_size * cannot change under us because we hold the i_mutex. diff --git a/fs/mount.h b/fs/mount.h index d2e25d7b64b3..2c856fc47ae3 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -94,6 +94,12 @@ extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *); extern int __legitimize_mnt(struct vfsmount *, unsigned); extern bool legitimize_mnt(struct vfsmount *, unsigned); +static inline bool __path_is_mountpoint(const struct path *path) +{ + struct mount *m = __lookup_mnt(path->mnt, path->dentry); + return m && likely(!(m->mnt.mnt_flags & MNT_SYNC_UMOUNT)); +} + extern void __detach_mounts(struct dentry *dentry); static inline void detach_mounts(struct dentry *dentry) diff --git a/fs/namei.c b/fs/namei.c index 2b55ea142273..1c372debcbbe 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1200,7 +1200,7 @@ static int follow_managed(struct path *path, struct nameidata *nd) if (managed & DCACHE_MANAGE_TRANSIT) { BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); - ret = path->dentry->d_op->d_manage(path->dentry, false); + ret = path->dentry->d_op->d_manage(path, false); if (ret < 0) break; } @@ -1263,10 +1263,10 @@ int follow_down_one(struct path *path) } EXPORT_SYMBOL(follow_down_one); -static inline int managed_dentry_rcu(struct dentry *dentry) +static inline int managed_dentry_rcu(const struct path *path) { - return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ? - dentry->d_op->d_manage(dentry, true) : 0; + return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ? + path->dentry->d_op->d_manage(path, true) : 0; } /* @@ -1282,7 +1282,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. */ - switch (managed_dentry_rcu(path->dentry)) { + switch (managed_dentry_rcu(path)) { case -ECHILD: default: return false; @@ -1392,8 +1392,7 @@ int follow_down(struct path *path) if (managed & DCACHE_MANAGE_TRANSIT) { BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); - ret = path->dentry->d_op->d_manage( - path->dentry, false); + ret = path->dentry->d_op->d_manage(path, false); if (ret < 0) return ret == -EISDIR ? 0 : ret; } @@ -2863,7 +2862,7 @@ bool may_open_dev(const struct path *path) !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); } -static int may_open(struct path *path, int acc_mode, int flag) +static int may_open(const struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; @@ -2913,7 +2912,7 @@ static int may_open(struct path *path, int acc_mode, int flag) static int handle_truncate(struct file *filp) { - struct path *path = &filp->f_path; + const struct path *path = &filp->f_path; struct inode *inode = path->dentry->d_inode; int error = get_write_access(inode); if (error) diff --git a/fs/namespace.c b/fs/namespace.c index e6c234b1a645..f7e28f8ea04d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -678,7 +678,7 @@ out: * * lookup_mnt takes a reference to the found vfsmount. */ -struct vfsmount *lookup_mnt(struct path *path) +struct vfsmount *lookup_mnt(const struct path *path) { struct mount *child_mnt; struct vfsmount *m; @@ -1159,7 +1159,36 @@ struct vfsmount *mntget(struct vfsmount *mnt) } EXPORT_SYMBOL(mntget); -struct vfsmount *mnt_clone_internal(struct path *path) +/* path_is_mountpoint() - Check if path is a mount in the current + * namespace. + * + * d_mountpoint() can only be used reliably to establish if a dentry is + * not mounted in any namespace and that common case is handled inline. + * d_mountpoint() isn't aware of the possibility there may be multiple + * mounts using a given dentry in a different namespace. This function + * checks if the passed in path is a mountpoint rather than the dentry + * alone. + */ +bool path_is_mountpoint(const struct path *path) +{ + unsigned seq; + bool res; + + if (!d_mountpoint(path->dentry)) + return false; + + rcu_read_lock(); + do { + seq = read_seqbegin(&mount_lock); + res = __path_is_mountpoint(path); + } while (read_seqretry(&mount_lock, seq)); + rcu_read_unlock(); + + return res; +} +EXPORT_SYMBOL(path_is_mountpoint); + +struct vfsmount *mnt_clone_internal(const struct path *path) { struct mount *p; p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); @@ -1758,7 +1787,7 @@ out: /* Caller should check returned pointer for errors */ -struct vfsmount *collect_mounts(struct path *path) +struct vfsmount *collect_mounts(const struct path *path) { struct mount *tree; namespace_lock(); @@ -1791,7 +1820,7 @@ void drop_collected_mounts(struct vfsmount *mnt) * * Release with mntput(). */ -struct vfsmount *clone_private_mount(struct path *path) +struct vfsmount *clone_private_mount(const struct path *path) { struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt; @@ -2997,7 +3026,7 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); } -bool path_is_under(struct path *path1, struct path *path2) +bool path_is_under(const struct path *path1, const struct path *path2) { bool res; read_seqlock_excl(&mount_lock); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 64c11f399b3d..55208b9b3c11 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -377,7 +377,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, */ if (!PageUptodate(page)) { unsigned pglen = nfs_page_length(page); - unsigned end = offset + len; + unsigned end = offset + copied; if (pglen == 0) { zero_user_segments(page, 0, offset, diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 6faaf710e563..5a4ec309e283 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -85,7 +85,7 @@ static int dnotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { struct dnotify_mark *dn_mark; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index e0e5f7c3c99f..bbc175d4213d 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -90,10 +90,10 @@ static int fanotify_get_response(struct fsnotify_group *group, static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmnt_mark, u32 event_mask, - void *data, int data_type) + const void *data, int data_type) { __u32 marks_mask, marks_ignored_mask; - struct path *path = data; + const struct path *path = data; pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p" " data_type=%d\n", __func__, inode_mark, vfsmnt_mark, @@ -140,7 +140,7 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, } struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, - struct path *path) + const struct path *path) { struct fanotify_event_info *event; @@ -177,7 +177,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *fanotify_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { int ret = 0; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 2a5fb14115df..4500a74f8d38 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -47,4 +47,4 @@ static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) } struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, - struct path *path); + const struct path *path); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index db39de2dd4cb..b41515d3f081 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -86,7 +86,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) } /* Notify this dentry's parent about a child's events. */ -int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) +int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask) { struct dentry *parent; struct inode *p_inode; @@ -125,7 +125,7 @@ EXPORT_SYMBOL_GPL(__fsnotify_parent); static int send_to_group(struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, + __u32 mask, const void *data, int data_is, u32 cookie, const unsigned char *file_name) { @@ -187,7 +187,7 @@ static int send_to_group(struct inode *to_tell, * out to all of the registered fsnotify_group. Those groups can then use the * notification event in whatever means they feel necessary. */ -int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, +int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, const unsigned char *file_name, u32 cookie) { struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; @@ -199,7 +199,7 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); if (data_is == FSNOTIFY_EVENT_PATH) - mnt = real_mount(((struct path *)data)->mnt); + mnt = real_mount(((const struct path *)data)->mnt); else mnt = NULL; diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index ed855ef6f077..a6f5907a3fee 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -26,7 +26,7 @@ extern int inotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie); extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 2cd900c2c737..19e7ec109a75 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -66,7 +66,7 @@ int inotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { struct inotify_inode_mark *i_mark; @@ -80,7 +80,7 @@ int inotify_handle_event(struct fsnotify_group *group, if ((inode_mark->mask & FS_EXCL_UNLINK) && (data_type == FSNOTIFY_EVENT_PATH)) { - struct path *path = data; + const struct path *path = data; if (d_unlinked(path->dentry)) return 0; diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f72712f6c28d..d4ec0d8961a6 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5194,7 +5194,7 @@ int ocfs2_change_extent_flag(handle_t *handle, rec = &el->l_recs[index]; if (new_flags && (rec->e_flags & new_flags)) { mlog(ML_ERROR, "Owner %llu tried to set %d flags on an " - "extent that already had them", + "extent that already had them\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), new_flags); goto out; @@ -5202,7 +5202,7 @@ int ocfs2_change_extent_flag(handle_t *handle, if (clear_flags && !(rec->e_flags & clear_flags)) { mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an " - "extent that didn't have them", + "extent that didn't have them\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), clear_flags); goto out; @@ -5713,8 +5713,7 @@ int ocfs2_remove_btree_range(struct inode *inode, struct ocfs2_refcount_tree *ref_tree = NULL; if ((flags & OCFS2_EXT_REFCOUNTED) && len) { - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & - OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); if (!refcount_tree_locked) { ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 4d9c6f5ec28a..11556b7d93ec 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -464,6 +464,15 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)block); + /* + * The swap code (ab-)uses ->bmap to get a block mapping and then + * bypasseѕ the file system for actual I/O. We really can't allow + * that on refcounted inodes, so we have to skip out here. And yes, + * 0 is the magic code for a bmap error.. + */ + if (ocfs2_is_refcount_inode(inode)) + return 0; + /* We don't need to lock journal system files, since they aren't * accessed concurrently from multiple nodes. */ @@ -2253,10 +2262,10 @@ out: return ret; } -static void ocfs2_dio_end_io_write(struct inode *inode, - struct ocfs2_dio_write_ctxt *dwc, - loff_t offset, - ssize_t bytes) +static int ocfs2_dio_end_io_write(struct inode *inode, + struct ocfs2_dio_write_ctxt *dwc, + loff_t offset, + ssize_t bytes) { struct ocfs2_cached_dealloc_ctxt dealloc; struct ocfs2_extent_tree et; @@ -2307,7 +2316,7 @@ static void ocfs2_dio_end_io_write(struct inode *inode, mlog_errno(ret); } - di = (struct ocfs2_dinode *)di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); @@ -2364,6 +2373,8 @@ out: if (locked) inode_unlock(inode); ocfs2_dio_free_write_ctx(inode, dwc); + + return ret; } /* @@ -2378,21 +2389,19 @@ static int ocfs2_dio_end_io(struct kiocb *iocb, { struct inode *inode = file_inode(iocb->ki_filp); int level; - - if (bytes <= 0) - return 0; + int ret = 0; /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); - if (private) - ocfs2_dio_end_io_write(inode, private, offset, bytes); + if (bytes > 0 && private) + ret = ocfs2_dio_end_io_write(inode, private, offset, bytes); ocfs2_iocb_clear_rw_locked(iocb); level = ocfs2_iocb_rw_locked_level(iocb); ocfs2_rw_unlock(inode, level); - return 0; + return ret; } static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 000c234d7bbd..c4889655d32b 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1030,7 +1030,7 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, * Only quota files call this without a bh, and they can't be * refcounted. */ - BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!di_bh && ocfs2_is_refcount_inode(inode)); BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE)); clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); @@ -1667,9 +1667,9 @@ static void ocfs2_calc_trunc_pos(struct inode *inode, *done = ret; } -static int ocfs2_remove_inode_range(struct inode *inode, - struct buffer_head *di_bh, u64 byte_start, - u64 byte_len) +int ocfs2_remove_inode_range(struct inode *inode, + struct buffer_head *di_bh, u64 byte_start, + u64 byte_len) { int ret = 0, flags = 0, done = 0, i; u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos; @@ -1719,8 +1719,7 @@ static int ocfs2_remove_inode_range(struct inode *inode, * within one cluster(means is not exactly aligned to clustersize). */ - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { - + if (ocfs2_is_refcount_inode(inode)) { ret = ocfs2_cow_file_pos(inode, di_bh, byte_start); if (ret) { mlog_errno(ret); @@ -2036,7 +2035,7 @@ int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, struct super_block *sb = inode->i_sb; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) || - !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) || + !ocfs2_is_refcount_inode(inode) || OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return 0; @@ -2440,6 +2439,31 @@ out: return offset; } +static int ocfs2_file_clone_range(struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len) +{ + return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, + len, false); +} + +static ssize_t ocfs2_file_dedupe_range(struct file *src_file, + u64 loff, + u64 len, + struct file *dst_file, + u64 dst_loff) +{ + int error; + + error = ocfs2_reflink_remap_range(src_file, loff, dst_file, dst_loff, + len, true); + if (error) + return error; + return len; +} + const struct inode_operations ocfs2_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, @@ -2479,6 +2503,8 @@ const struct file_operations ocfs2_fops = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, + .clone_file_range = ocfs2_file_clone_range, + .dedupe_file_range = ocfs2_file_dedupe_range, }; const struct file_operations ocfs2_dops = { @@ -2524,6 +2550,8 @@ const struct file_operations ocfs2_fops_no_plocks = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, + .clone_file_range = ocfs2_file_clone_range, + .dedupe_file_range = ocfs2_file_dedupe_range, }; const struct file_operations ocfs2_dops_no_plocks = { diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index e8c62f22215c..897fd9a2e51d 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -82,4 +82,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, size_t count); +int ocfs2_remove_inode_range(struct inode *inode, + struct buffer_head *di_bh, u64 byte_start, + u64 byte_len); #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 5af68fcdf9d3..9b955f732bca 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -181,4 +181,10 @@ static inline struct ocfs2_inode_info *cache_info_to_inode(struct ocfs2_caching_ return container_of(ci, struct ocfs2_inode_info, ip_metadata_cache); } +/* Does this inode have the reflink flag set? */ +static inline bool ocfs2_is_refcount_inode(struct inode *inode) +{ + return (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); +} + #endif /* OCFS2_INODE_H */ diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 4e8f32eb0bdb..e52a2852d50d 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -235,10 +235,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { - - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & - OCFS2_HAS_REFCOUNT_FL)); - + BUG_ON(!ocfs2_is_refcount_inode(inode)); BUG_ON(!context->refcount_loc); ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, @@ -581,10 +578,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) { - - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & - OCFS2_HAS_REFCOUNT_FL)); - + BUG_ON(!ocfs2_is_refcount_inode(inode)); BUG_ON(!context->refcount_loc); ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 738b4ea8e990..d171d2c53f7f 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -34,6 +34,7 @@ #include "xattr.h" #include "namei.h" #include "ocfs2_trace.h" +#include "file.h" #include <linux/bio.h> #include <linux/blkdev.h> @@ -410,7 +411,7 @@ static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno) goto out; } - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); di = (struct ocfs2_dinode *)di_bh->b_data; *ref_blkno = le64_to_cpu(di->i_refcount_loc); @@ -569,7 +570,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, u32 num_got; u64 suballoc_loc, first_blkno; - BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + BUG_ON(ocfs2_is_refcount_inode(inode)); trace_ocfs2_create_refcount_tree( (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -707,7 +708,7 @@ static int ocfs2_set_refcount_tree(struct inode *inode, struct ocfs2_refcount_block *rb; struct ocfs2_refcount_tree *ref_tree; - BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + BUG_ON(ocfs2_is_refcount_inode(inode)); ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, &ref_tree, &ref_root_bh); @@ -774,7 +775,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc); u16 bit = 0; - if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) + if (!ocfs2_is_refcount_inode(inode)) return 0; BUG_ON(!ref_blkno); @@ -2298,11 +2299,10 @@ int ocfs2_decrease_refcount(struct inode *inode, { int ret; u64 ref_blkno; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *tree; - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_get_refcount_block(inode, &ref_blkno); if (ret) { @@ -2532,7 +2532,6 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, int *ref_blocks) { int ret; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *tree; u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); @@ -2543,7 +2542,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, goto out; } - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), refcount_loc, &tree); @@ -3411,14 +3410,13 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, { int ret; u32 cow_start = 0, cow_len = 0; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *ref_tree; struct ocfs2_cow_context *context = NULL; - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list, cpos, write_len, max_cpos, @@ -3628,11 +3626,10 @@ int ocfs2_refcount_cow_xattr(struct inode *inode, { int ret; struct ocfs2_xattr_value_root *xv = vb->vb_xv; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_cow_context *context = NULL; u32 cow_start, cow_len; - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list, cpos, write_len, UINT_MAX, @@ -3695,6 +3692,9 @@ int ocfs2_add_refcount_flag(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_alloc_context *meta_ac = NULL; + /* We need to be able to handle at least an extent tree split. */ + ref_blocks = ocfs2_extend_meta_needed(data_et->et_root_el); + ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, ref_ci, ref_root_bh, p_cluster, num_clusters, @@ -3806,7 +3806,7 @@ static int ocfs2_attach_refcount_tree(struct inode *inode, ocfs2_init_dealloc_ctxt(&dealloc); - if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) { + if (!ocfs2_is_refcount_inode(inode)) { ret = ocfs2_create_refcount_tree(inode, di_bh); if (ret) { mlog_errno(ret); @@ -3933,6 +3933,13 @@ static int ocfs2_add_refcounted_extent(struct inode *inode, ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh, p_cluster, num_clusters, meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, num_clusters)); if (ret) mlog_errno(ret); @@ -4441,3 +4448,434 @@ out: return error; } + +/* Update destination inode size, if necessary. */ +static int ocfs2_reflink_update_dest(struct inode *dest, + struct buffer_head *d_bh, + loff_t newlen) +{ + handle_t *handle; + int ret; + + dest->i_blocks = ocfs2_inode_sector_count(dest); + + if (newlen <= i_size_read(dest)) + return 0; + + handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return ret; + } + + /* Extend i_size if needed. */ + spin_lock(&OCFS2_I(dest)->ip_lock); + if (newlen > i_size_read(dest)) + i_size_write(dest, newlen); + spin_unlock(&OCFS2_I(dest)->ip_lock); + dest->i_ctime = dest->i_mtime = current_time(dest); + + ret = ocfs2_mark_inode_dirty(handle, dest, d_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + +out_commit: + ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle); + return ret; +} + +/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */ +static int ocfs2_reflink_remap_extent(struct inode *s_inode, + struct buffer_head *s_bh, + loff_t pos_in, + struct inode *t_inode, + struct buffer_head *t_bh, + loff_t pos_out, + loff_t len, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + struct ocfs2_extent_tree s_et; + struct ocfs2_extent_tree t_et; + struct ocfs2_dinode *dis; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_tree *ref_tree; + struct ocfs2_super *osb; + loff_t pstart, plen; + u32 p_cluster, num_clusters, slast, spos, tpos; + unsigned int ext_flags; + int ret = 0; + + osb = OCFS2_SB(s_inode->i_sb); + dis = (struct ocfs2_dinode *)s_bh->b_data; + ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh); + ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh); + + spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in); + tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out); + slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len); + + while (spos < slast) { + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + + /* Look up the extent. */ + ret = ocfs2_get_clusters(s_inode, spos, &p_cluster, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + num_clusters = min_t(u32, num_clusters, slast - spos); + + /* Punch out the dest range. */ + pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos); + plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters); + ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (p_cluster == 0) + goto next_loop; + + /* Lock the refcount btree... */ + ret = ocfs2_lock_refcount_tree(osb, + le64_to_cpu(dis->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Mark s_inode's extent as refcounted. */ + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) { + ret = ocfs2_add_refcount_flag(s_inode, &s_et, + &ref_tree->rf_ci, + ref_root_bh, spos, + p_cluster, num_clusters, + dealloc, NULL); + if (ret) { + mlog_errno(ret); + goto out_unlock_refcount; + } + } + + /* Map in the new extent. */ + ext_flags |= OCFS2_EXT_REFCOUNTED; + ret = ocfs2_add_refcounted_extent(t_inode, &t_et, + &ref_tree->rf_ci, + ref_root_bh, + tpos, p_cluster, + num_clusters, + ext_flags, + dealloc); + if (ret) { + mlog_errno(ret); + goto out_unlock_refcount; + } + + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); +next_loop: + spos += num_clusters; + tpos += num_clusters; + } + +out: + return ret; +out_unlock_refcount: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); + return ret; +} + +/* Set up refcount tree and remap s_inode to t_inode. */ +static int ocfs2_reflink_remap_blocks(struct inode *s_inode, + struct buffer_head *s_bh, + loff_t pos_in, + struct inode *t_inode, + struct buffer_head *t_bh, + loff_t pos_out, + loff_t len) +{ + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_super *osb; + struct ocfs2_dinode *dis; + struct ocfs2_dinode *dit; + int ret; + + osb = OCFS2_SB(s_inode->i_sb); + dis = (struct ocfs2_dinode *)s_bh->b_data; + dit = (struct ocfs2_dinode *)t_bh->b_data; + ocfs2_init_dealloc_ctxt(&dealloc); + + /* + * If we're reflinking the entire file and the source is inline + * data, just copy the contents. + */ + if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) && + i_size_read(t_inode) <= len && + (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) { + ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh); + if (ret) + mlog_errno(ret); + goto out; + } + + /* + * If both inodes belong to two different refcount groups then + * forget it because we don't know how (or want) to go merging + * refcount trees. + */ + ret = -EOPNOTSUPP; + if (ocfs2_is_refcount_inode(s_inode) && + ocfs2_is_refcount_inode(t_inode) && + le64_to_cpu(dis->i_refcount_loc) != + le64_to_cpu(dit->i_refcount_loc)) + goto out; + + /* Neither inode has a refcount tree. Add one to s_inode. */ + if (!ocfs2_is_refcount_inode(s_inode) && + !ocfs2_is_refcount_inode(t_inode)) { + ret = ocfs2_create_refcount_tree(s_inode, s_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* Ensure that both inodes end up with the same refcount tree. */ + if (!ocfs2_is_refcount_inode(s_inode)) { + ret = ocfs2_set_refcount_tree(s_inode, s_bh, + le64_to_cpu(dit->i_refcount_loc)); + if (ret) { + mlog_errno(ret); + goto out; + } + } + if (!ocfs2_is_refcount_inode(t_inode)) { + ret = ocfs2_set_refcount_tree(t_inode, t_bh, + le64_to_cpu(dis->i_refcount_loc)); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* Turn off inline data in the dest file. */ + if (OCFS2_I(t_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_convert_inline_data_to_extents(t_inode, t_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* Actually remap extents now. */ + ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh, + pos_out, len, &dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + +out: + if (ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + } + + return ret; +} + +/* Lock an inode and grab a bh pointing to the inode. */ +static int ocfs2_reflink_inodes_lock(struct inode *s_inode, + struct buffer_head **bh1, + struct inode *t_inode, + struct buffer_head **bh2) +{ + struct inode *inode1; + struct inode *inode2; + struct ocfs2_inode_info *oi1; + struct ocfs2_inode_info *oi2; + bool same_inode = (s_inode == t_inode); + int status; + + /* First grab the VFS and rw locks. */ + lock_two_nondirectories(s_inode, t_inode); + inode1 = s_inode; + inode2 = t_inode; + if (inode1->i_ino > inode2->i_ino) + swap(inode1, inode2); + + status = ocfs2_rw_lock(inode1, 1); + if (status) { + mlog_errno(status); + goto out_i1; + } + if (!same_inode) { + status = ocfs2_rw_lock(inode2, 1); + if (status) { + mlog_errno(status); + goto out_i2; + } + } + + /* Now go for the cluster locks */ + oi1 = OCFS2_I(inode1); + oi2 = OCFS2_I(inode2); + + trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno, + (unsigned long long)oi2->ip_blkno); + + if (*bh1) + *bh1 = NULL; + if (*bh2) + *bh2 = NULL; + + /* We always want to lock the one with the lower lockid first. */ + if (oi1->ip_blkno > oi2->ip_blkno) + mlog_errno(-ENOLCK); + + /* lock id1 */ + status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto out_rw2; + } + + /* lock id2 */ + if (!same_inode) { + status = ocfs2_inode_lock_nested(inode2, bh2, 1, + OI_LS_REFLINK_TARGET); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto out_cl1; + } + } else + *bh2 = *bh1; + + trace_ocfs2_double_lock_end( + (unsigned long long)OCFS2_I(inode1)->ip_blkno, + (unsigned long long)OCFS2_I(inode2)->ip_blkno); + + return 0; + +out_cl1: + ocfs2_inode_unlock(inode1, 1); + brelse(*bh1); + *bh1 = NULL; +out_rw2: + ocfs2_rw_unlock(inode2, 1); +out_i2: + ocfs2_rw_unlock(inode1, 1); +out_i1: + unlock_two_nondirectories(s_inode, t_inode); + return status; +} + +/* Unlock both inodes and release buffers. */ +static void ocfs2_reflink_inodes_unlock(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh) +{ + ocfs2_inode_unlock(s_inode, 1); + ocfs2_rw_unlock(s_inode, 1); + brelse(s_bh); + if (s_inode != t_inode) { + ocfs2_inode_unlock(t_inode, 1); + ocfs2_rw_unlock(t_inode, 1); + brelse(t_bh); + } + unlock_two_nondirectories(s_inode, t_inode); +} + +/* Link a range of blocks from one file to another. */ +int ocfs2_reflink_remap_range(struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len, + bool is_dedupe) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb); + struct buffer_head *in_bh = NULL, *out_bh = NULL; + bool same_inode = (inode_in == inode_out); + ssize_t ret; + + if (!ocfs2_refcount_tree(osb)) + return -EOPNOTSUPP; + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + /* Lock both files against IO */ + ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh); + if (ret) + return ret; + + /* Check file eligibility and prepare for block sharing. */ + ret = -EINVAL; + if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) || + (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE)) + goto out_unlock; + + ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, + &len, is_dedupe); + if (ret || len == 0) + goto out_unlock; + + /* Lock out changes to the allocation maps and remap. */ + down_write(&OCFS2_I(inode_in)->ip_alloc_sem); + if (!same_inode) + down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem, + SINGLE_DEPTH_NESTING); + + ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out, + out_bh, pos_out, len); + + /* Zap any page cache for the destination file's range. */ + if (!ret) + truncate_inode_pages_range(&inode_out->i_data, pos_out, + PAGE_ALIGN(pos_out + len) - 1); + + up_write(&OCFS2_I(inode_in)->ip_alloc_sem); + if (!same_inode) + up_write(&OCFS2_I(inode_out)->ip_alloc_sem); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + /* + * Empty the extent map so that we may get the right extent + * record from the disk. + */ + ocfs2_extent_map_trunc(inode_in, 0); + ocfs2_extent_map_trunc(inode_out, 0); + + ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh); + return 0; + +out_unlock: + ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh); + return ret; +} diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 6422bbcdb525..4af55bf4b35b 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -115,4 +115,11 @@ int ocfs2_reflink_ioctl(struct inode *inode, const char __user *oldname, const char __user *newname, bool preserve); +int ocfs2_reflink_remap_range(struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len, + bool is_dedupe); + #endif /* OCFS2_REFCOUNTTREE_H */ diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index cb157a34a656..3c5384d9b3a5 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2577,7 +2577,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) return 0; - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { + if (ocfs2_is_refcount_inode(inode)) { ret = ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), le64_to_cpu(di->i_refcount_loc), 1, &ref_tree, &ref_root_bh); @@ -3608,7 +3608,7 @@ int ocfs2_xattr_set(struct inode *inode, } /* Check whether the value is refcounted and do some preparation. */ - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL && + if (ocfs2_is_refcount_inode(inode) && (!xis.not_found || !xbs.not_found)) { ret = ocfs2_prepare_refcount_xattr(inode, di, &xi, &xis, &xbs, &ref_tree, diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 1bfac28b7e7d..8738a0d62c09 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2401,7 +2401,7 @@ int dquot_resume(struct super_block *sb, int type) EXPORT_SYMBOL(dquot_resume); int dquot_quota_on(struct super_block *sb, int type, int format_id, - struct path *path) + const struct path *path) { int error = security_quota_on(path->dentry); if (error) diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 2d445425aad7..5acd0c4769af 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -80,7 +80,7 @@ unsigned int qtype_enforce_flag(int type) } static int quota_quotaon(struct super_block *sb, int type, qid_t id, - struct path *path) + const struct path *path) { if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable) return -ENOSYS; @@ -700,7 +700,7 @@ static int quota_rmxquota(struct super_block *sb, void __user *addr) /* Copy parameters and call proper function */ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, - void __user *addr, struct path *path) + void __user *addr, const struct path *path) { int ret; diff --git a/fs/read_write.c b/fs/read_write.c index 53bccd1c786e..da6de12b5c46 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1540,20 +1540,37 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, sb_start_write(inode_out->i_sb); - ret = -EOPNOTSUPP; - if (file_out->f_op->copy_file_range) + /* + * Try cloning first, this is supported by more file systems, and + * more efficient if both clone and copy are supported (e.g. NFS). + */ + if (file_in->f_op->clone_file_range) { + ret = file_in->f_op->clone_file_range(file_in, pos_in, + file_out, pos_out, len); + if (ret == 0) { + ret = len; + goto done; + } + } + + if (file_out->f_op->copy_file_range) { ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); - if (ret == -EOPNOTSUPP) - ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, - len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); + if (ret != -EOPNOTSUPP) + goto done; + } + + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); +done: if (ret > 0) { fsnotify_access(file_in); add_rchar(current, ret); fsnotify_modify(file_out); add_wchar(current, ret); } + inc_syscr(current); inc_syscw(current); @@ -1648,6 +1665,114 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) return security_file_permission(file, write ? MAY_WRITE : MAY_READ); } +/* + * Check that the two inodes are eligible for cloning, the ranges make + * sense, and then flush all dirty data. Caller must ensure that the + * inodes have been locked against any other modifications. + */ +int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, + struct inode *inode_out, loff_t pos_out, + u64 *len, bool is_dedupe) +{ + loff_t bs = inode_out->i_sb->s_blocksize; + loff_t blen; + loff_t isize; + bool same_inode = (inode_in == inode_out); + int ret; + + /* Don't touch certain kinds of inodes */ + if (IS_IMMUTABLE(inode_out)) + return -EPERM; + + if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) + return -ETXTBSY; + + /* Don't reflink dirs, pipes, sockets... */ + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + return -EISDIR; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + return -EINVAL; + + /* Are we going all the way to the end? */ + isize = i_size_read(inode_in); + if (isize == 0) { + *len = 0; + return 0; + } + + /* Zero length dedupe exits immediately; reflink goes to EOF. */ + if (*len == 0) { + if (is_dedupe) { + *len = 0; + return 0; + } + *len = isize - pos_in; + } + + /* Ensure offsets don't wrap and the input is inside i_size */ + if (pos_in + *len < pos_in || pos_out + *len < pos_out || + pos_in + *len > isize) + return -EINVAL; + + /* Don't allow dedupe past EOF in the dest file */ + if (is_dedupe) { + loff_t disize; + + disize = i_size_read(inode_out); + if (pos_out >= disize || pos_out + *len > disize) + return -EINVAL; + } + + /* If we're linking to EOF, continue to the block boundary. */ + if (pos_in + *len == isize) + blen = ALIGN(isize, bs) - pos_in; + else + blen = *len; + + /* Only reflink if we're aligned to block boundaries */ + if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || + !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) + return -EINVAL; + + /* Don't allow overlapped reflink within the same file */ + if (same_inode) { + if (pos_out + blen > pos_in && pos_out < pos_in + blen) + return -EINVAL; + } + + /* Wait for the completion of any pending IOs on both files */ + inode_dio_wait(inode_in); + if (!same_inode) + inode_dio_wait(inode_out); + + ret = filemap_write_and_wait_range(inode_in->i_mapping, + pos_in, pos_in + *len - 1); + if (ret) + return ret; + + ret = filemap_write_and_wait_range(inode_out->i_mapping, + pos_out, pos_out + *len - 1); + if (ret) + return ret; + + /* + * Check that the extents are the same. + */ + if (is_dedupe) { + bool is_same = false; + + ret = vfs_dedupe_file_range_compare(inode_in, pos_in, + inode_out, pos_out, *len, &is_same); + if (ret) + return ret; + if (!is_same) + return -EBADE; + } + + return 0; +} +EXPORT_SYMBOL(vfs_clone_file_prep_inodes); + int vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len) { @@ -1698,6 +1823,102 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in, } EXPORT_SYMBOL(vfs_clone_file_range); +/* + * Read a page's worth of file data into the page cache. Return the page + * locked. + */ +static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) +{ + struct address_space *mapping; + struct page *page; + pgoff_t n; + + n = offset >> PAGE_SHIFT; + mapping = inode->i_mapping; + page = read_mapping_page(mapping, n, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + lock_page(page); + return page; +} + +/* + * Compare extents of two files to see if they are the same. + * Caller must have locked both inodes to prevent write races. + */ +int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dest, loff_t destoff, + loff_t len, bool *is_same) +{ + loff_t src_poff; + loff_t dest_poff; + void *src_addr; + void *dest_addr; + struct page *src_page; + struct page *dest_page; + loff_t cmp_len; + bool same; + int error; + + error = -EINVAL; + same = true; + while (len) { + src_poff = srcoff & (PAGE_SIZE - 1); + dest_poff = destoff & (PAGE_SIZE - 1); + cmp_len = min(PAGE_SIZE - src_poff, + PAGE_SIZE - dest_poff); + cmp_len = min(cmp_len, len); + if (cmp_len <= 0) + goto out_error; + + src_page = vfs_dedupe_get_page(src, srcoff); + if (IS_ERR(src_page)) { + error = PTR_ERR(src_page); + goto out_error; + } + dest_page = vfs_dedupe_get_page(dest, destoff); + if (IS_ERR(dest_page)) { + error = PTR_ERR(dest_page); + unlock_page(src_page); + put_page(src_page); + goto out_error; + } + src_addr = kmap_atomic(src_page); + dest_addr = kmap_atomic(dest_page); + + flush_dcache_page(src_page); + flush_dcache_page(dest_page); + + if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) + same = false; + + kunmap_atomic(dest_addr); + kunmap_atomic(src_addr); + unlock_page(dest_page); + unlock_page(src_page); + put_page(dest_page); + put_page(src_page); + + if (!same) + break; + + srcoff += cmp_len; + destoff += cmp_len; + len -= cmp_len; + } + + *is_same = same; + return 0; + +out_error: + return error; +} +EXPORT_SYMBOL(vfs_dedupe_file_range_compare); + int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) { struct file_dedupe_range_info *info; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 0a6ad4e71e88..e314cb30a181 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -802,7 +802,7 @@ static int reiserfs_acquire_dquot(struct dquot *); static int reiserfs_release_dquot(struct dquot *); static int reiserfs_mark_dquot_dirty(struct dquot *); static int reiserfs_write_info(struct super_block *, int); -static int reiserfs_quota_on(struct super_block *, int, int, struct path *); +static int reiserfs_quota_on(struct super_block *, int, int, const struct path *); static const struct dquot_operations reiserfs_quota_operations = { .write_dquot = reiserfs_write_dquot, @@ -2348,7 +2348,7 @@ static int reiserfs_quota_on_mount(struct super_block *sb, int type) * Standard function to be called on quota_on */ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, - struct path *path) + const struct path *path) { int err; struct inode *inode; diff --git a/fs/statfs.c b/fs/statfs.c index 083dc0ac9140..13ae259d4879 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -63,7 +63,7 @@ static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf) return retval; } -int vfs_statfs(struct path *path, struct kstatfs *buf) +int vfs_statfs(const struct path *path, struct kstatfs *buf) { int error; diff --git a/fs/utimes.c b/fs/utimes.c index 22307cdf7014..5fdb505e307c 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -48,7 +48,7 @@ static bool nsec_valid(long nsec) return nsec >= 0 && nsec <= 999999999; } -static int utimes_common(struct path *path, struct timespec *times) +static int utimes_common(const struct path *path, struct timespec *times) { int error; struct iattr newattrs; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 65d27a502909..bbb9eb6811b2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -848,24 +848,6 @@ out_unlock: return error; } -STATIC ssize_t -xfs_file_copy_range( - struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - size_t len, - unsigned int flags) -{ - int error; - - error = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, false); - if (error) - return error; - return len; -} - STATIC int xfs_file_clone_range( struct file *file_in, @@ -1549,7 +1531,6 @@ const struct file_operations xfs_file_operations = { .fsync = xfs_file_fsync, .get_unmapped_area = thp_get_unmapped_area, .fallocate = xfs_file_fallocate, - .copy_file_range = xfs_file_copy_range, .clone_file_range = xfs_file_clone_range, .dedupe_file_range = xfs_file_dedupe_range, }; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 88fd03c66e99..aca2d4bd4303 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1114,111 +1114,6 @@ err: } /* - * Read a page's worth of file data into the page cache. Return the page - * locked. - */ -static struct page * -xfs_get_page( - struct inode *inode, - xfs_off_t offset) -{ - struct address_space *mapping; - struct page *page; - pgoff_t n; - - n = offset >> PAGE_SHIFT; - mapping = inode->i_mapping; - page = read_mapping_page(mapping, n, NULL); - if (IS_ERR(page)) - return page; - if (!PageUptodate(page)) { - put_page(page); - return ERR_PTR(-EIO); - } - lock_page(page); - return page; -} - -/* - * Compare extents of two files to see if they are the same. - */ -static int -xfs_compare_extents( - struct inode *src, - xfs_off_t srcoff, - struct inode *dest, - xfs_off_t destoff, - xfs_off_t len, - bool *is_same) -{ - xfs_off_t src_poff; - xfs_off_t dest_poff; - void *src_addr; - void *dest_addr; - struct page *src_page; - struct page *dest_page; - xfs_off_t cmp_len; - bool same; - int error; - - error = -EINVAL; - same = true; - while (len) { - src_poff = srcoff & (PAGE_SIZE - 1); - dest_poff = destoff & (PAGE_SIZE - 1); - cmp_len = min(PAGE_SIZE - src_poff, - PAGE_SIZE - dest_poff); - cmp_len = min(cmp_len, len); - ASSERT(cmp_len > 0); - - trace_xfs_reflink_compare_extents(XFS_I(src), srcoff, cmp_len, - XFS_I(dest), destoff); - - src_page = xfs_get_page(src, srcoff); - if (IS_ERR(src_page)) { - error = PTR_ERR(src_page); - goto out_error; - } - dest_page = xfs_get_page(dest, destoff); - if (IS_ERR(dest_page)) { - error = PTR_ERR(dest_page); - unlock_page(src_page); - put_page(src_page); - goto out_error; - } - src_addr = kmap_atomic(src_page); - dest_addr = kmap_atomic(dest_page); - - flush_dcache_page(src_page); - flush_dcache_page(dest_page); - - if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) - same = false; - - kunmap_atomic(dest_addr); - kunmap_atomic(src_addr); - unlock_page(dest_page); - unlock_page(src_page); - put_page(dest_page); - put_page(src_page); - - if (!same) - break; - - srcoff += cmp_len; - destoff += cmp_len; - len -= cmp_len; - } - - *is_same = same; - return 0; - -out_error: - trace_xfs_reflink_compare_extents_error(XFS_I(dest), error, _RET_IP_); - return error; -} - -/* * Link a range of blocks from one file to another. */ int @@ -1235,14 +1130,11 @@ xfs_reflink_remap_range( struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); struct xfs_mount *mp = src->i_mount; - loff_t bs = inode_out->i_sb->s_blocksize; bool same_inode = (inode_in == inode_out); xfs_fileoff_t sfsbno, dfsbno; xfs_filblks_t fsblen; xfs_extlen_t cowextsize; - loff_t isize; ssize_t ret; - loff_t blen; if (!xfs_sb_version_hasreflink(&mp->m_sb)) return -EOPNOTSUPP; @@ -1257,26 +1149,8 @@ xfs_reflink_remap_range( else xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL); - /* Don't touch certain kinds of inodes */ - ret = -EPERM; - if (IS_IMMUTABLE(inode_out)) - goto out_unlock; - - ret = -ETXTBSY; - if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) - goto out_unlock; - - - /* Don't reflink dirs, pipes, sockets... */ - ret = -EISDIR; - if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) - goto out_unlock; + /* Check file eligibility and prepare for block sharing. */ ret = -EINVAL; - if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode)) - goto out_unlock; - if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) - goto out_unlock; - /* Don't reflink realtime inodes */ if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) goto out_unlock; @@ -1285,97 +1159,18 @@ xfs_reflink_remap_range( if (IS_DAX(inode_in) || IS_DAX(inode_out)) goto out_unlock; - /* Are we going all the way to the end? */ - isize = i_size_read(inode_in); - if (isize == 0) { - ret = 0; - goto out_unlock; - } - - /* Zero length dedupe exits immediately; reflink goes to EOF. */ - if (len == 0) { - if (is_dedupe) { - ret = 0; - goto out_unlock; - } - len = isize - pos_in; - } - - /* Ensure offsets don't wrap and the input is inside i_size */ - if (pos_in + len < pos_in || pos_out + len < pos_out || - pos_in + len > isize) - goto out_unlock; - - /* Don't allow dedupe past EOF in the dest file */ - if (is_dedupe) { - loff_t disize; - - disize = i_size_read(inode_out); - if (pos_out >= disize || pos_out + len > disize) - goto out_unlock; - } - - /* If we're linking to EOF, continue to the block boundary. */ - if (pos_in + len == isize) - blen = ALIGN(isize, bs) - pos_in; - else - blen = len; - - /* Only reflink if we're aligned to block boundaries */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || - !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) - goto out_unlock; - - /* Don't allow overlapped reflink within the same file */ - if (same_inode) { - if (pos_out + blen > pos_in && pos_out < pos_in + blen) - goto out_unlock; - } - - /* Wait for the completion of any pending IOs on both files */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - ret = filemap_write_and_wait_range(inode_in->i_mapping, - pos_in, pos_in + len - 1); - if (ret) - goto out_unlock; - - ret = filemap_write_and_wait_range(inode_out->i_mapping, - pos_out, pos_out + len - 1); - if (ret) + ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, + &len, is_dedupe); + if (ret || len == 0) goto out_unlock; trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); - /* - * Check that the extents are the same. - */ - if (is_dedupe) { - bool is_same = false; - - ret = xfs_compare_extents(inode_in, pos_in, inode_out, pos_out, - len, &is_same); - if (ret) - goto out_unlock; - if (!is_same) { - ret = -EBADE; - goto out_unlock; - } - } - + /* Set flags and remap blocks. */ ret = xfs_reflink_set_inode_flag(src, dest); if (ret) goto out_unlock; - /* - * Invalidate the page cache so that we can clear any CoW mappings - * in the destination file. - */ - truncate_inode_pages_range(&inode_out->i_data, pos_out, - PAGE_ALIGN(pos_out + len) - 1); - dfsbno = XFS_B_TO_FSBT(mp, pos_out); sfsbno = XFS_B_TO_FSBT(mp, pos_in); fsblen = XFS_B_TO_FSB(mp, len); @@ -1384,6 +1179,10 @@ xfs_reflink_remap_range( if (ret) goto out_unlock; + /* Zap any page cache for the destination file's range. */ + truncate_inode_pages_range(&inode_out->i_data, pos_out, + PAGE_ALIGN(pos_out + len) - 1); + /* * Carry the cowextsize hint from src to dest if we're sharing the * entire source file to the entire destination file, the source file |