author    | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-07 20:03:24 +0300
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-07 20:03:24 +0300
commit    | eb5699ba31558bdb2cee6ebde3d0a68091e47dce (patch)
tree      | 3aeab3158f7ae43431405f3aa4f2e1fa3d103206 /fs
parent    | b5a8466d37d30cfcc8015789f4a3f0c44b6c7bc6 (diff)
parent    | b99695580bfc1f91364023c673681ddb88e375dc (diff)
download  | linux-eb5699ba31558bdb2cee6ebde3d0a68091e47dce.tar.xz
Merge tag 'mm-nonmm-stable-2022-08-06-2' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull misc updates from Andrew Morton:
"Updates to various subsystems which I help look after. lib, ocfs2,
fatfs, autofs, squashfs, procfs, etc. A relatively small amount of
material this time"
* tag 'mm-nonmm-stable-2022-08-06-2' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (72 commits)
scripts/gdb: ensure the absolute path is generated on initial source
MAINTAINERS: kunit: add David Gow as a maintainer of KUnit
mailmap: add linux.dev alias for Brendan Higgins
mailmap: update Kirill's email
profile: setup_profiling_timer() is moslty not implemented
ocfs2: fix a typo in a comment
ocfs2: use the bitmap API to simplify code
ocfs2: remove some useless functions
lib/mpi: fix typo 'the the' in comment
proc: add some (hopefully) insightful comments
bdi: remove enum wb_congested_state
kernel/hung_task: fix address space of proc_dohung_task_timeout_secs
lib/lzo/lzo1x_compress.c: replace ternary operator with min() and min_t()
squashfs: support reading fragments in readahead call
squashfs: implement readahead
squashfs: always build "file direct" version of page actor
Revert "squashfs: provide backing_dev_info in order to disable read-ahead"
fs/ocfs2: Fix spelling typo in comment
ia64: old_rr4 added under CONFIG_HUGETLB_PAGE
proc: fix test for "vsyscall=xonly" boot option
...
Diffstat (limited to 'fs')
32 files changed, 636 insertions, 340 deletions
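Among the fs/ changes below, fs/fat/namei_vfat.c replaces vfat_rename() as the ->rename hook with vfat_rename2(), which accepts RENAME_EXCHANGE in addition to RENAME_NOREPLACE and dispatches exchanges to the new vfat_rename_exchange() helper. For orientation only, here is a minimal userspace sketch of how that flag is exercised via renameat2(2); the mount point and file names are hypothetical, the program is not part of the patch, and the raw syscall is used so it builds on older glibc.

```c
/*
 * Minimal userspace sketch (not part of the patch): atomically swap two
 * names on a FAT volume with RENAME_EXCHANGE.  The /mnt/vfat paths are
 * hypothetical examples.
 */
#define _GNU_SOURCE
#include <fcntl.h>          /* AT_FDCWD */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef RENAME_EXCHANGE
#define RENAME_EXCHANGE (1 << 1)    /* value from <linux/fs.h> */
#endif

int main(void)
{
	/* Both names must already exist; the swap is atomic. */
	if (syscall(SYS_renameat2, AT_FDCWD, "/mnt/vfat/a.txt",
		    AT_FDCWD, "/mnt/vfat/b.txt", RENAME_EXCHANGE) == -1) {
		perror("renameat2(RENAME_EXCHANGE)");
		return 1;
	}
	puts("exchanged a.txt and b.txt");
	return 0;
}
```

Before this series, vfat's rename op rejected any flag other than RENAME_NOREPLACE with -EINVAL, so a call like the one above only succeeds once the exchange path added in the diff is in place.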
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 918826eaceea..d5a44fa88acf 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -51,8 +51,6 @@ extern struct file_system_type autofs_fs_type; */ struct autofs_info { struct dentry *dentry; - struct inode *inode; - int flags; struct completion expire_complete; @@ -148,6 +146,11 @@ static inline int autofs_oz_mode(struct autofs_sb_info *sbi) task_pgrp(current) == sbi->oz_pgrp); } +static inline bool autofs_empty(struct autofs_info *ino) +{ + return ino->count < 2; +} + struct inode *autofs_get_inode(struct super_block *, umode_t); void autofs_free_ino(struct autofs_info *); diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index b3fefd6237c3..038b3d2d9f57 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -371,7 +371,7 @@ static struct dentry *should_expire(struct dentry *dentry, return NULL; } - if (simple_empty(dentry)) + if (autofs_empty(ino)) return NULL; /* Case 2: tree mount, expire iff entire tree is not busy */ diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 9edf243713eb..affa70360b1f 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -20,6 +20,7 @@ struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi) INIT_LIST_HEAD(&ino->expiring); ino->last_used = jiffies; ino->sbi = sbi; + ino->count = 1; } return ino; } diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 91fe4548c256..ca03c1cae2be 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -10,6 +10,7 @@ #include "autofs_i.h" +static int autofs_dir_permission(struct user_namespace *, struct inode *, int); static int autofs_dir_symlink(struct user_namespace *, struct inode *, struct dentry *, const char *); static int autofs_dir_unlink(struct inode *, struct dentry *); @@ -50,6 +51,7 @@ const struct file_operations autofs_dir_operations = { const struct inode_operations autofs_dir_inode_operations = { .lookup = autofs_lookup, + .permission = autofs_dir_permission, .unlink = autofs_dir_unlink, .symlink = autofs_dir_symlink, .mkdir = autofs_dir_mkdir, @@ -77,6 +79,7 @@ static int autofs_dir_open(struct inode *inode, struct file *file) { struct dentry *dentry = file->f_path.dentry; struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry); @@ -93,7 +96,7 @@ static int autofs_dir_open(struct inode *inode, struct file *file) * it. */ spin_lock(&sbi->lookup_lock); - if (!path_is_mountpoint(&file->f_path) && simple_empty(dentry)) { + if (!path_is_mountpoint(&file->f_path) && autofs_empty(ino)) { spin_unlock(&sbi->lookup_lock); return -ENOENT; } @@ -288,9 +291,26 @@ static struct dentry *autofs_mountpoint_changed(struct path *path) struct dentry *dentry = path->dentry; struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); - /* - * If this is an indirect mount the dentry could have gone away - * as a result of an expire and a new one created. + /* If this is an indirect mount the dentry could have gone away + * and a new one created. + * + * This is unusual and I can't remember the case for which it + * was originally added now. But an example of how this can + * happen is an autofs indirect mount that has the "browse" + * option set and also has the "symlink" option in the autofs + * map entry. In this case the daemon will remove the browse + * directory and create a symlink as the mount leaving the + * struct path stale. 
+ * + * Another not so obvious case is when a mount in an autofs + * indirect mount that uses the "nobrowse" option is being + * expired at the same time as a path walk. If the mount has + * been umounted but the mount point directory seen before + * becoming unhashed (during a lockless path walk) when a stat + * family system call is made the mount won't be re-mounted as + * it should. In this case the mount point that's been removed + * (by the daemon) will be stale and the a new mount point + * dentry created. */ if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { struct dentry *parent = dentry->d_parent; @@ -362,7 +382,7 @@ static struct vfsmount *autofs_d_automount(struct path *path) * the mount never trigger mounts themselves (they have an * autofs trigger mount mounted on them). But v4 pseudo direct * mounts do need the leaves to trigger mounts. In this case - * we have no choice but to use the list_empty() check and + * we have no choice but to use the autofs_empty() check and * require user space behave. */ if (sbi->version > 4) { @@ -371,7 +391,7 @@ static struct vfsmount *autofs_d_automount(struct path *path) goto done; } } else { - if (!simple_empty(dentry)) { + if (!autofs_empty(ino)) { spin_unlock(&sbi->fs_lock); goto done; } @@ -426,9 +446,8 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk) if (rcu_walk) { /* We don't need fs_lock in rcu_walk mode, - * just testing 'AUTOFS_INFO_NO_RCU' is enough. - * simple_empty() takes a spinlock, so leave it - * to last. + * just testing 'AUTOFS_INF_WANT_EXPIRE' is enough. + * * We only return -EISDIR when certain this isn't * a mount-trap. */ @@ -441,9 +460,7 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk) inode = d_inode_rcu(dentry); if (inode && S_ISLNK(inode->i_mode)) return -EISDIR; - if (list_empty(&dentry->d_subdirs)) - return 0; - if (!simple_empty(dentry)) + if (!autofs_empty(ino)) return -EISDIR; return 0; } @@ -463,7 +480,7 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk) * we can avoid needless calls ->d_automount() and avoid * an incorrect ELOOP error return. */ - if ((!path_is_mountpoint(path) && !simple_empty(dentry)) || + if ((!path_is_mountpoint(path) && !autofs_empty(ino)) || (d_really_is_positive(dentry) && d_is_symlink(dentry))) status = -EISDIR; } @@ -526,11 +543,30 @@ static struct dentry *autofs_lookup(struct inode *dir, return NULL; } +static int autofs_dir_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) +{ + if (mask & MAY_WRITE) { + struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb); + + if (!autofs_oz_mode(sbi)) + return -EACCES; + + /* autofs_oz_mode() needs to allow path walks when the + * autofs mount is catatonic but the state of an autofs + * file system needs to be preserved over restarts. 
+ */ + if (sbi->flags & AUTOFS_SBI_CATATONIC) + return -EACCES; + } + + return generic_permission(mnt_userns, inode, mask); +} + static int autofs_dir_symlink(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, const char *symname) { - struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; struct inode *inode; @@ -539,16 +575,6 @@ static int autofs_dir_symlink(struct user_namespace *mnt_userns, pr_debug("%s <- %pd\n", symname, dentry); - if (!autofs_oz_mode(sbi)) - return -EACCES; - - /* autofs_oz_mode() needs to allow path walks when the - * autofs mount is catatonic but the state of an autofs - * file system needs to be preserved over restarts. - */ - if (sbi->flags & AUTOFS_SBI_CATATONIC) - return -EACCES; - BUG_ON(!ino); autofs_clean_ino(ino); @@ -571,7 +597,6 @@ static int autofs_dir_symlink(struct user_namespace *mnt_userns, d_add(dentry, inode); dget(dentry); - ino->count++; p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; @@ -601,17 +626,6 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; - if (!autofs_oz_mode(sbi)) - return -EACCES; - - /* autofs_oz_mode() needs to allow path walks when the - * autofs mount is catatonic but the state of an autofs - * file system needs to be preserved over restarts. - */ - if (sbi->flags & AUTOFS_SBI_CATATONIC) - return -EACCES; - - ino->count--; p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count--; dput(ino->dentry); @@ -683,16 +697,6 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) pr_debug("dentry %p, removing %pd\n", dentry, dentry); - if (!autofs_oz_mode(sbi)) - return -EACCES; - - /* autofs_oz_mode() needs to allow path walks when the - * autofs mount is catatonic but the state of an autofs - * file system needs to be preserved over restarts. - */ - if (sbi->flags & AUTOFS_SBI_CATATONIC) - return -EACCES; - if (ino->count != 1) return -ENOTEMPTY; @@ -704,7 +708,6 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) if (sbi->version < 5) autofs_clear_leaf_automount_flags(dentry); - ino->count--; p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count--; dput(ino->dentry); @@ -726,16 +729,6 @@ static int autofs_dir_mkdir(struct user_namespace *mnt_userns, struct autofs_info *p_ino; struct inode *inode; - if (!autofs_oz_mode(sbi)) - return -EACCES; - - /* autofs_oz_mode() needs to allow path walks when the - * autofs mount is catatonic but the state of an autofs - * file system needs to be preserved over restarts. - */ - if (sbi->flags & AUTOFS_SBI_CATATONIC) - return -EACCES; - pr_debug("dentry %p, creating %pd\n", dentry, dentry); BUG_ON(!ino); @@ -753,7 +746,6 @@ static int autofs_dir_mkdir(struct user_namespace *mnt_userns, autofs_set_leaf_automount_flags(dentry); dget(dentry); - ino->count++; p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; inc_nlink(dir); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index e2daa940ebce..8b56b94e2f56 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1747,6 +1747,21 @@ static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms) return to; } +/* + * autoremove_wake_function, but remove even on failure to wake up, because we + * know that default_wake_function/ttwu will only fail if the thread is already + * woken, and in that case the ep_poll loop will remove the entry anyways, not + * try to reuse it. 
+ */ +static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, + unsigned int mode, int sync, void *key) +{ + int ret = default_wake_function(wq_entry, mode, sync, key); + + list_del_init(&wq_entry->entry); + return ret; +} + /** * ep_poll - Retrieves ready events, and delivers them to the caller-supplied * event buffer. @@ -1828,8 +1843,15 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, * normal wakeup path no need to call __remove_wait_queue() * explicitly, thus ep->lock is not taken, which halts the * event delivery. + * + * In fact, we now use an even more aggressive function that + * unconditionally removes, because we don't reuse the wait + * entry between loop iterations. This lets us also avoid the + * performance issue if a process is killed, causing all of its + * threads to wake up without being removed normally. */ init_wait(&wait); + wait.func = ep_autoremove_wake_function; write_lock_irq(&ep->lock); /* diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index c573314806cf..21620054e1c4 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -889,22 +889,57 @@ out: return err; } -static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir, - struct dentry *old_dentry, struct inode *new_dir, - struct dentry *new_dentry, unsigned int flags) +static int vfat_get_dotdot_de(struct inode *inode, struct buffer_head **bh, + struct msdos_dir_entry **de) +{ + if (S_ISDIR(inode->i_mode)) { + if (fat_get_dotdot_entry(inode, bh, de)) + return -EIO; + } + return 0; +} + +static int vfat_sync_ipos(struct inode *dir, struct inode *inode) +{ + if (IS_DIRSYNC(dir)) + return fat_sync_inode(inode); + mark_inode_dirty(inode); + return 0; +} + +static int vfat_update_dotdot_de(struct inode *dir, struct inode *inode, + struct buffer_head *dotdot_bh, + struct msdos_dir_entry *dotdot_de) +{ + fat_set_start(dotdot_de, MSDOS_I(dir)->i_logstart); + mark_buffer_dirty_inode(dotdot_bh, inode); + if (IS_DIRSYNC(dir)) + return sync_dirty_buffer(dotdot_bh); + return 0; +} + +static void vfat_update_dir_metadata(struct inode *dir, struct timespec64 *ts) +{ + inode_inc_iversion(dir); + fat_truncate_time(dir, ts, S_CTIME | S_MTIME); + if (IS_DIRSYNC(dir)) + (void)fat_sync_inode(dir); + else + mark_inode_dirty(dir); +} + +static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) { struct buffer_head *dotdot_bh; - struct msdos_dir_entry *dotdot_de; + struct msdos_dir_entry *dotdot_de = NULL; struct inode *old_inode, *new_inode; struct fat_slot_info old_sinfo, sinfo; struct timespec64 ts; loff_t new_i_pos; - int err, is_dir, update_dotdot, corrupt = 0; + int err, is_dir, corrupt = 0; struct super_block *sb = old_dir->i_sb; - if (flags & ~RENAME_NOREPLACE) - return -EINVAL; - old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; old_inode = d_inode(old_dentry); new_inode = d_inode(new_dentry); @@ -913,15 +948,13 @@ static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir, if (err) goto out; - is_dir = S_ISDIR(old_inode->i_mode); - update_dotdot = (is_dir && old_dir != new_dir); - if (update_dotdot) { - if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de)) { - err = -EIO; + if (old_dir != new_dir) { + err = vfat_get_dotdot_de(old_inode, &dotdot_bh, &dotdot_de); + if (err) goto out; - } } + is_dir = S_ISDIR(old_inode->i_mode); ts = current_time(old_dir); if (new_inode) { if (is_dir) { @@ -942,21 +975,15 @@ static int vfat_rename(struct user_namespace 
*mnt_userns, struct inode *old_dir, fat_detach(old_inode); fat_attach(old_inode, new_i_pos); - if (IS_DIRSYNC(new_dir)) { - err = fat_sync_inode(old_inode); - if (err) - goto error_inode; - } else - mark_inode_dirty(old_inode); + err = vfat_sync_ipos(new_dir, old_inode); + if (err) + goto error_inode; - if (update_dotdot) { - fat_set_start(dotdot_de, MSDOS_I(new_dir)->i_logstart); - mark_buffer_dirty_inode(dotdot_bh, old_inode); - if (IS_DIRSYNC(new_dir)) { - err = sync_dirty_buffer(dotdot_bh); - if (err) - goto error_dotdot; - } + if (dotdot_de) { + err = vfat_update_dotdot_de(new_dir, old_inode, dotdot_bh, + dotdot_de); + if (err) + goto error_dotdot; drop_nlink(old_dir); if (!new_inode) inc_nlink(new_dir); @@ -966,12 +993,7 @@ static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir, old_sinfo.bh = NULL; if (err) goto error_dotdot; - inode_inc_iversion(old_dir); - fat_truncate_time(old_dir, &ts, S_CTIME|S_MTIME); - if (IS_DIRSYNC(old_dir)) - (void)fat_sync_inode(old_dir); - else - mark_inode_dirty(old_dir); + vfat_update_dir_metadata(old_dir, &ts); if (new_inode) { drop_nlink(new_inode); @@ -991,10 +1013,9 @@ error_dotdot: /* data cluster is shared, serious corruption */ corrupt = 1; - if (update_dotdot) { - fat_set_start(dotdot_de, MSDOS_I(old_dir)->i_logstart); - mark_buffer_dirty_inode(dotdot_bh, old_inode); - corrupt |= sync_dirty_buffer(dotdot_bh); + if (dotdot_de) { + corrupt |= vfat_update_dotdot_de(old_dir, old_inode, dotdot_bh, + dotdot_de); } error_inode: fat_detach(old_inode); @@ -1021,13 +1042,145 @@ error_inode: goto out; } +static void vfat_exchange_ipos(struct inode *old_inode, struct inode *new_inode, + loff_t old_i_pos, loff_t new_i_pos) +{ + fat_detach(old_inode); + fat_detach(new_inode); + fat_attach(old_inode, new_i_pos); + fat_attach(new_inode, old_i_pos); +} + +static void vfat_move_nlink(struct inode *src, struct inode *dst) +{ + drop_nlink(src); + inc_nlink(dst); +} + +static int vfat_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct buffer_head *old_dotdot_bh = NULL, *new_dotdot_bh = NULL; + struct msdos_dir_entry *old_dotdot_de = NULL, *new_dotdot_de = NULL; + struct inode *old_inode, *new_inode; + struct timespec64 ts = current_time(old_dir); + loff_t old_i_pos, new_i_pos; + int err, corrupt = 0; + struct super_block *sb = old_dir->i_sb; + + old_inode = d_inode(old_dentry); + new_inode = d_inode(new_dentry); + + /* Acquire super block lock for the operation to be atomic */ + mutex_lock(&MSDOS_SB(sb)->s_lock); + + /* if directories are not the same, get ".." info to update */ + if (old_dir != new_dir) { + err = vfat_get_dotdot_de(old_inode, &old_dotdot_bh, + &old_dotdot_de); + if (err) + goto out; + + err = vfat_get_dotdot_de(new_inode, &new_dotdot_bh, + &new_dotdot_de); + if (err) + goto out; + } + + old_i_pos = MSDOS_I(old_inode)->i_pos; + new_i_pos = MSDOS_I(new_inode)->i_pos; + + vfat_exchange_ipos(old_inode, new_inode, old_i_pos, new_i_pos); + + err = vfat_sync_ipos(old_dir, new_inode); + if (err) + goto error_exchange; + err = vfat_sync_ipos(new_dir, old_inode); + if (err) + goto error_exchange; + + /* update ".." 
directory entry info */ + if (old_dotdot_de) { + err = vfat_update_dotdot_de(new_dir, old_inode, old_dotdot_bh, + old_dotdot_de); + if (err) + goto error_old_dotdot; + } + if (new_dotdot_de) { + err = vfat_update_dotdot_de(old_dir, new_inode, new_dotdot_bh, + new_dotdot_de); + if (err) + goto error_new_dotdot; + } + + /* if cross directory and only one is a directory, adjust nlink */ + if (!old_dotdot_de != !new_dotdot_de) { + if (old_dotdot_de) + vfat_move_nlink(old_dir, new_dir); + else + vfat_move_nlink(new_dir, old_dir); + } + + vfat_update_dir_metadata(old_dir, &ts); + /* if directories are not the same, update new_dir as well */ + if (old_dir != new_dir) + vfat_update_dir_metadata(new_dir, &ts); + +out: + brelse(old_dotdot_bh); + brelse(new_dotdot_bh); + mutex_unlock(&MSDOS_SB(sb)->s_lock); + + return err; + +error_new_dotdot: + if (new_dotdot_de) { + corrupt |= vfat_update_dotdot_de(new_dir, new_inode, + new_dotdot_bh, new_dotdot_de); + } + +error_old_dotdot: + if (old_dotdot_de) { + corrupt |= vfat_update_dotdot_de(old_dir, old_inode, + old_dotdot_bh, old_dotdot_de); + } + +error_exchange: + vfat_exchange_ipos(old_inode, new_inode, new_i_pos, old_i_pos); + corrupt |= vfat_sync_ipos(new_dir, new_inode); + corrupt |= vfat_sync_ipos(old_dir, old_inode); + + if (corrupt < 0) { + fat_fs_error(new_dir->i_sb, + "%s: Filesystem corrupted (i_pos %lld, %lld)", + __func__, old_i_pos, new_i_pos); + } + goto out; +} + +static int vfat_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) +{ + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (flags & RENAME_EXCHANGE) { + return vfat_rename_exchange(old_dir, old_dentry, + new_dir, new_dentry); + } + + /* VFS already handled RENAME_NOREPLACE, handle it as a normal rename */ + return vfat_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static const struct inode_operations vfat_dir_inode_operations = { .create = vfat_create, .lookup = vfat_lookup, .unlink = vfat_unlink, .mkdir = vfat_mkdir, .rmdir = vfat_rmdir, - .rename = vfat_rename, + .rename = vfat_rename2, .setattr = fat_setattr, .getattr = fat_getattr, .update_time = fat_update_time, diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c index 1b07550485b9..5d826274570c 100644 --- a/fs/kernel_read_file.c +++ b/fs/kernel_read_file.c @@ -29,15 +29,15 @@ * change between calls to kernel_read_file(). * * Returns number of bytes read (no single read will be bigger - * than INT_MAX), or negative on error. + * than SSIZE_MAX), or negative on error. * */ -int kernel_read_file(struct file *file, loff_t offset, void **buf, - size_t buf_size, size_t *file_size, - enum kernel_read_file_id id) +ssize_t kernel_read_file(struct file *file, loff_t offset, void **buf, + size_t buf_size, size_t *file_size, + enum kernel_read_file_id id) { loff_t i_size, pos; - size_t copied; + ssize_t copied; void *allocated = NULL; bool whole_file; int ret; @@ -58,7 +58,7 @@ int kernel_read_file(struct file *file, loff_t offset, void **buf, goto out; } /* The file is too big for sane activities. 
*/ - if (i_size > INT_MAX) { + if (i_size > SSIZE_MAX) { ret = -EFBIG; goto out; } @@ -124,12 +124,12 @@ out: } EXPORT_SYMBOL_GPL(kernel_read_file); -int kernel_read_file_from_path(const char *path, loff_t offset, void **buf, - size_t buf_size, size_t *file_size, - enum kernel_read_file_id id) +ssize_t kernel_read_file_from_path(const char *path, loff_t offset, void **buf, + size_t buf_size, size_t *file_size, + enum kernel_read_file_id id) { struct file *file; - int ret; + ssize_t ret; if (!path || !*path) return -EINVAL; @@ -144,14 +144,14 @@ int kernel_read_file_from_path(const char *path, loff_t offset, void **buf, } EXPORT_SYMBOL_GPL(kernel_read_file_from_path); -int kernel_read_file_from_path_initns(const char *path, loff_t offset, - void **buf, size_t buf_size, - size_t *file_size, - enum kernel_read_file_id id) +ssize_t kernel_read_file_from_path_initns(const char *path, loff_t offset, + void **buf, size_t buf_size, + size_t *file_size, + enum kernel_read_file_id id) { struct file *file; struct path root; - int ret; + ssize_t ret; if (!path || !*path) return -EINVAL; @@ -171,12 +171,12 @@ int kernel_read_file_from_path_initns(const char *path, loff_t offset, } EXPORT_SYMBOL_GPL(kernel_read_file_from_path_initns); -int kernel_read_file_from_fd(int fd, loff_t offset, void **buf, - size_t buf_size, size_t *file_size, - enum kernel_read_file_id id) +ssize_t kernel_read_file_from_fd(int fd, loff_t offset, void **buf, + size_t buf_size, size_t *file_size, + enum kernel_read_file_id id) { struct fd f = fdget(fd); - int ret = -EBADF; + ssize_t ret = -EBADF; if (!f.file || !(f.file->f_mode & FMODE_READ)) goto out; diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index e360543ad7e7..8b2020f92b5f 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -296,17 +296,25 @@ static void dlmfs_evict_inode(struct inode *inode) { int status; struct dlmfs_inode_private *ip; + struct user_lock_res *lockres; + int teardown; clear_inode(inode); mlog(0, "inode %lu\n", inode->i_ino); ip = DLMFS_I(inode); + lockres = &ip->ip_lockres; if (S_ISREG(inode->i_mode)) { - status = user_dlm_destroy_lock(&ip->ip_lockres); - if (status < 0) - mlog_errno(status); + spin_lock(&lockres->l_lock); + teardown = !!(lockres->l_flags & USER_LOCK_IN_TEARDOWN); + spin_unlock(&lockres->l_lock); + if (!teardown) { + status = user_dlm_destroy_lock(lockres); + if (status < 0) + mlog_errno(status); + } iput(ip->ip_parent); goto clear_fields; } diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index 9099d8fc7599..22da768e65b7 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -2,12 +2,13 @@ /* * heartbeat.c * - * Register ourselves with the heartbaet service, keep our node maps + * Register ourselves with the heartbeat service, keep our node maps * up to date, and fire off recovery when needed. * * Copyright (C) 2002, 2004 Oracle. All rights reserved. */ +#include <linux/bitmap.h> #include <linux/fs.h> #include <linux/types.h> #include <linux/highmem.h> @@ -24,18 +25,12 @@ #include "buffer_head_io.h" -static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, - int bit); -static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, - int bit); - /* special case -1 for now * TODO: should *really* make sure the calling func never passes -1!! 
*/ static void ocfs2_node_map_init(struct ocfs2_node_map *map) { map->num_nodes = OCFS2_NODE_MAP_MAX_NODES; - memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) * - sizeof(unsigned long)); + bitmap_zero(map->map, OCFS2_NODE_MAP_MAX_NODES); } void ocfs2_init_node_maps(struct ocfs2_super *osb) @@ -65,12 +60,6 @@ void ocfs2_do_node_down(int node_num, void *data) ocfs2_recovery_thread(osb, node_num); } -static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, - int bit) -{ - set_bit(bit, map->map); -} - void ocfs2_node_map_set_bit(struct ocfs2_super *osb, struct ocfs2_node_map *map, int bit) @@ -79,16 +68,10 @@ void ocfs2_node_map_set_bit(struct ocfs2_super *osb, return; BUG_ON(bit >= map->num_nodes); spin_lock(&osb->node_map_lock); - __ocfs2_node_map_set_bit(map, bit); + set_bit(bit, map->map); spin_unlock(&osb->node_map_lock); } -static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, - int bit) -{ - clear_bit(bit, map->map); -} - void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, struct ocfs2_node_map *map, int bit) @@ -97,7 +80,7 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, return; BUG_ON(bit >= map->num_nodes); spin_lock(&osb->node_map_lock); - __ocfs2_node_map_clear_bit(map, bit); + clear_bit(bit, map->map); spin_unlock(&osb->node_map_lock); } diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 0b6f551a342a..dc9f76ab7e13 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -412,7 +412,7 @@ out_unlock: goto out_err; } -/* Write information to global quota file. Expects exlusive lock on quota +/* Write information to global quota file. Expects exclusive lock on quota * file inode and quota info */ static int __ocfs2_global_write_info(struct super_block *sb, int type) { diff --git a/fs/proc/array.c b/fs/proc/array.c index eb815759842c..99fcbfda8e25 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -69,7 +69,6 @@ #include <linux/sched/cputime.h> #include <linux/proc_fs.h> #include <linux/ioport.h> -#include <linux/uaccess.h> #include <linux/io.h> #include <linux/mm.h> #include <linux/hugetlb.h> @@ -100,6 +99,10 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape) { char tcomm[64]; + /* + * Test before PF_KTHREAD because all workqueue worker threads are + * kernel threads. + */ if (p->flags & PF_WQ_WORKER) wq_worker_comm(tcomm, sizeof(tcomm), p); else if (p->flags & PF_KTHREAD) diff --git a/fs/proc/base.c b/fs/proc/base.c index 8dfa36a99c74..93f7e3d971e4 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1885,7 +1885,7 @@ void proc_pid_evict_inode(struct proc_inode *ei) put_pid(pid); } -struct inode *proc_pid_make_inode(struct super_block * sb, +struct inode *proc_pid_make_inode(struct super_block *sb, struct task_struct *task, umode_t mode) { struct inode * inode; @@ -1914,11 +1914,6 @@ struct inode *proc_pid_make_inode(struct super_block * sb, /* Let the pid remember us for quick removal */ ei->pid = pid; - if (S_ISDIR(mode)) { - spin_lock(&pid->lock); - hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes); - spin_unlock(&pid->lock); - } task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); security_task_to_inode(task, inode); @@ -1931,6 +1926,39 @@ out_unlock: return NULL; } +/* + * Generating an inode and adding it into @pid->inodes, so that task will + * invalidate inode's dentry before being released. + * + * This helper is used for creating dir-type entries under '/proc' and + * '/proc/<tgid>/task'. Other entries(eg. 
fd, stat) under '/proc/<tgid>' + * can be released by invalidating '/proc/<tgid>' dentry. + * In theory, dentries under '/proc/<tgid>/task' can also be released by + * invalidating '/proc/<tgid>' dentry, we reserve it to handle single + * thread exiting situation: Any one of threads should invalidate its + * '/proc/<tgid>/task/<pid>' dentry before released. + */ +static struct inode *proc_pid_make_base_inode(struct super_block *sb, + struct task_struct *task, umode_t mode) +{ + struct inode *inode; + struct proc_inode *ei; + struct pid *pid; + + inode = proc_pid_make_inode(sb, task, mode); + if (!inode) + return NULL; + + /* Let proc_flush_pid find this directory inode */ + ei = PROC_I(inode); + pid = ei->pid; + spin_lock(&pid->lock); + hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes); + spin_unlock(&pid->lock); + + return inode; +} + int pid_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -3369,7 +3397,8 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry, { struct inode *inode; - inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); + inode = proc_pid_make_base_inode(dentry->d_sb, task, + S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) return ERR_PTR(-ENOENT); @@ -3671,7 +3700,8 @@ static struct dentry *proc_task_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { struct inode *inode; - inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); + inode = proc_pid_make_base_inode(dentry->d_sb, task, + S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) return ERR_PTR(-ENOENT); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 73aeb4e6d32e..f130499ad843 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -26,8 +26,6 @@ #include <linux/mount.h> #include <linux/bug.h> -#include <linux/uaccess.h> - #include "internal.h" static void proc_evict_inode(struct inode *inode) @@ -214,7 +212,15 @@ static void unuse_pde(struct proc_dir_entry *pde) complete(pde->pde_unload_completion); } -/* pde is locked on entry, unlocked on exit */ +/* + * At most 2 contexts can enter this function: the one doing the last + * close on the descriptor and whoever is deleting PDE itself. + * + * First to enter calls ->proc_release hook and signals its completion + * to the second one which waits and then does nothing. + * + * PDE is locked on entry, unlocked on exit. + */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) __releases(&pde->pde_unload_lock) { @@ -224,9 +230,6 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) * * rmmod (remove_proc_entry() et al) can't delete an entry and proceed: * "struct file" needs to be available at the right moment. - * - * Therefore, first process to enter this function does ->release() and - * signals its completion to the other process which does nothing. */ if (pdeo->closing) { /* somebody else is doing that, just wait */ @@ -240,10 +243,12 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); + file = pdeo->file; pde->proc_ops->proc_release(file_inode(file), file); + spin_lock(&pde->pde_unload_lock); - /* After ->release. */ + /* Strictly after ->proc_release, see above. 
*/ list_del(&pdeo->lh); c = pdeo->c; spin_unlock(&pde->pde_unload_lock); diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index b38ad552887f..592e6dc7c110 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -15,7 +15,6 @@ #include <linux/fs.h> #include <linux/syslog.h> -#include <linux/uaccess.h> #include <asm/io.h> extern wait_queue_head_t log_wait; diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 13452b32e2bd..4d3493579458 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -21,7 +21,6 @@ #include <linux/seq_file.h> #include <linux/hugetlb.h> #include <linux/vmalloc.h> -#include <linux/uaccess.h> #include <asm/tlb.h> #include <asm/div64.h> #include "internal.h" diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 913e5acefbb6..856839b8ae8b 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -8,9 +8,6 @@ * * proc net directory handling functions */ - -#include <linux/uaccess.h> - #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> @@ -353,6 +350,12 @@ static __net_init int proc_net_ns_init(struct net *net) kgid_t gid; int err; + /* + * This PDE acts only as an anchor for /proc/${pid}/net hierarchy. + * Corresponding inode (PDE(inode) == net->proc_net) is never + * instantiated therefore blanket zeroing is fine. + * net->proc_net_stat inode is instantiated normally. + */ err = -ENOMEM; netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); if (!netd) diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index c69ff191e5d8..5c6a5ceab2f1 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -4,8 +4,6 @@ * * Copyright 1997, Theodore Ts'o */ - -#include <linux/uaccess.h> #include <linux/module.h> #include <linux/init.h> #include <linux/errno.h> diff --git a/fs/proc/root.c b/fs/proc/root.c index c7e3b1350ef8..3c2ee3eb1138 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -6,9 +6,6 @@ * * proc root directory handling functions */ - -#include <linux/uaccess.h> - #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> @@ -305,6 +302,11 @@ void __init proc_root_init(void) proc_mkdir("bus", NULL); proc_sys_init(); + /* + * Last things last. It is not like userspace processes eager + * to open /proc files exist at this point but register last + * anyway. 
+ */ register_filesystem(&proc_fs_type); } diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 4eaeb645e759..f2aa86c421f2 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -25,7 +25,6 @@ #include <linux/mutex.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> -#include <linux/uaccess.h> #include <linux/uio.h> #include <linux/cc_platform.h> #include <asm/io.h> diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 7bd9b8b856d0..477c89a519ee 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -5,9 +5,9 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o -squashfs-y += namei.o super.o symlink.o decompressor.o +squashfs-y += namei.o super.o symlink.o decompressor.o page_actor.o squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o -squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o +squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 8879d052f96c..833aca92301f 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -34,12 +34,15 @@ static int copy_bio_to_actor(struct bio *bio, struct squashfs_page_actor *actor, int offset, int req_length) { - void *actor_addr = squashfs_first_page(actor); + void *actor_addr; struct bvec_iter_all iter_all = {}; struct bio_vec *bvec = bvec_init_iter_all(&iter_all); int copied_bytes = 0; int actor_offset = 0; + squashfs_actor_nobuff(actor); + actor_addr = squashfs_first_page(actor); + if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) return 0; @@ -49,8 +52,9 @@ static int copy_bio_to_actor(struct bio *bio, bytes_to_copy = min_t(int, bytes_to_copy, req_length - copied_bytes); - memcpy(actor_addr + actor_offset, bvec_virt(bvec) + offset, - bytes_to_copy); + if (!IS_ERR(actor_addr)) + memcpy(actor_addr + actor_offset, bvec_virt(bvec) + + offset, bytes_to_copy); actor_offset += bytes_to_copy; copied_bytes += bytes_to_copy; diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index 1b9ccfd0aa51..19ab60834389 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -20,6 +20,7 @@ struct squashfs_decompressor { struct bio *, int, int, struct squashfs_page_actor *); int id; char *name; + int alloc_buffer; int supported; }; diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 7f0904b20329..98e64fec75b7 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -39,6 +39,7 @@ #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +#include "page_actor.h" /* * Locate cache slot in range [offset, index] for specified inode. 
If @@ -496,7 +497,137 @@ out: return res; } +static int squashfs_readahead_fragment(struct page **page, + unsigned int pages, unsigned int expected) +{ + struct inode *inode = page[0]->mapping->host; + struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb, + squashfs_i(inode)->fragment_block, + squashfs_i(inode)->fragment_size); + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + unsigned int n, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; + + if (buffer->error) + goto out; + + expected += squashfs_i(inode)->fragment_offset; + + for (n = 0; n < pages; n++) { + unsigned int base = (page[n]->index & mask) << PAGE_SHIFT; + unsigned int offset = base + squashfs_i(inode)->fragment_offset; + + if (expected > offset) { + unsigned int avail = min_t(unsigned int, expected - + offset, PAGE_SIZE); + + squashfs_fill_page(page[n], buffer, offset, avail); + } + + unlock_page(page[n]); + put_page(page[n]); + } + +out: + squashfs_cache_put(buffer); + return buffer->error; +} + +static void squashfs_readahead(struct readahead_control *ractl) +{ + struct inode *inode = ractl->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + size_t mask = (1UL << msblk->block_log) - 1; + unsigned short shift = msblk->block_log - PAGE_SHIFT; + loff_t start = readahead_pos(ractl) & ~mask; + size_t len = readahead_length(ractl) + readahead_pos(ractl) - start; + struct squashfs_page_actor *actor; + unsigned int nr_pages = 0; + struct page **pages; + int i, file_end = i_size_read(inode) >> msblk->block_log; + unsigned int max_pages = 1UL << shift; + + readahead_expand(ractl, start, (len | mask) + 1); + + pages = kmalloc_array(max_pages, sizeof(void *), GFP_KERNEL); + if (!pages) + return; + + for (;;) { + pgoff_t index; + int res, bsize; + u64 block = 0; + unsigned int expected; + + nr_pages = __readahead_batch(ractl, pages, max_pages); + if (!nr_pages) + break; + + if (readahead_pos(ractl) >= i_size_read(inode)) + goto skip_pages; + + index = pages[0]->index >> shift; + if ((pages[nr_pages - 1]->index >> shift) != index) + goto skip_pages; + + expected = index == file_end ? 
+ (i_size_read(inode) & (msblk->block_size - 1)) : + msblk->block_size; + + if (index == file_end && squashfs_i(inode)->fragment_block != + SQUASHFS_INVALID_BLK) { + res = squashfs_readahead_fragment(pages, nr_pages, + expected); + if (res) + goto skip_pages; + continue; + } + + bsize = read_blocklist(inode, index, &block); + if (bsize == 0) + goto skip_pages; + + actor = squashfs_page_actor_init_special(msblk, pages, nr_pages, + expected); + if (!actor) + goto skip_pages; + + res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); + + kfree(actor); + + if (res == expected) { + int bytes; + + /* Last page (if present) may have trailing bytes not filled */ + bytes = res % PAGE_SIZE; + if (pages[nr_pages - 1]->index == file_end && bytes) + memzero_page(pages[nr_pages - 1], bytes, + PAGE_SIZE - bytes); + + for (i = 0; i < nr_pages; i++) { + flush_dcache_page(pages[i]); + SetPageUptodate(pages[i]); + } + } + + for (i = 0; i < nr_pages; i++) { + unlock_page(pages[i]); + put_page(pages[i]); + } + } + + kfree(pages); + return; + +skip_pages: + for (i = 0; i < nr_pages; i++) { + unlock_page(pages[i]); + put_page(pages[i]); + } + kfree(pages); +} const struct address_space_operations squashfs_aops = { - .read_folio = squashfs_read_folio + .read_folio = squashfs_read_folio, + .readahead = squashfs_readahead }; diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index a4894cc59447..be4b12d31e0c 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -18,9 +18,6 @@ #include "squashfs.h" #include "page_actor.h" -static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, - int pages, struct page **page, int bytes); - /* Read separately compressed datablock directly into page cache */ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, int expected) @@ -33,7 +30,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; int start_index = target_page->index & ~mask; int end_index = start_index | mask; - int i, n, pages, missing_pages, bytes, res = -ENOMEM; + int i, n, pages, bytes, res = -ENOMEM; struct page **page; struct squashfs_page_actor *actor; void *pageaddr; @@ -47,50 +44,38 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, if (page == NULL) return res; - /* - * Create a "page actor" which will kmap and kunmap the - * page cache pages appropriately within the decompressor - */ - actor = squashfs_page_actor_init_special(page, pages, 0); - if (actor == NULL) - goto out; - /* Try to grab all the pages covered by the Squashfs block */ - for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) { + for (i = 0, n = start_index; n <= end_index; n++) { page[i] = (n == target_page->index) ? target_page : grab_cache_page_nowait(target_page->mapping, n); - if (page[i] == NULL) { - missing_pages++; + if (page[i] == NULL) continue; - } if (PageUptodate(page[i])) { unlock_page(page[i]); put_page(page[i]); - page[i] = NULL; - missing_pages++; + continue; } + + i++; } - if (missing_pages) { - /* - * Couldn't get one or more pages, this page has either - * been VM reclaimed, but others are still in the page cache - * and uptodate, or we're racing with another thread in - * squashfs_readpage also trying to grab them. Fall back to - * using an intermediate buffer. 
- */ - res = squashfs_read_cache(target_page, block, bsize, pages, - page, expected); - if (res < 0) - goto mark_errored; + pages = i; + /* + * Create a "page actor" which will kmap and kunmap the + * page cache pages appropriately within the decompressor + */ + actor = squashfs_page_actor_init_special(msblk, page, pages, expected); + if (actor == NULL) goto out; - } /* Decompress directly into the page cache buffers */ res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); + + kfree(actor); + if (res < 0) goto mark_errored; @@ -99,12 +84,12 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, goto mark_errored; } - /* Last page may have trailing bytes not filled */ + /* Last page (if present) may have trailing bytes not filled */ bytes = res % PAGE_SIZE; - if (bytes) { - pageaddr = kmap_atomic(page[pages - 1]); + if (page[pages - 1]->index == end_index && bytes) { + pageaddr = kmap_local_page(page[pages - 1]); memset(pageaddr + bytes, 0, PAGE_SIZE - bytes); - kunmap_atomic(pageaddr); + kunmap_local(pageaddr); } /* Mark pages as uptodate, unlock and release */ @@ -116,7 +101,6 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, put_page(page[i]); } - kfree(actor); kfree(page); return 0; @@ -135,40 +119,6 @@ mark_errored: } out: - kfree(actor); kfree(page); return res; } - - -static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, - int pages, struct page **page, int bytes) -{ - struct inode *i = target_page->mapping->host; - struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, - block, bsize); - int res = buffer->error, n, offset = 0; - - if (res) { - ERROR("Unable to read page, block %llx, size %x\n", block, - bsize); - goto out; - } - - for (n = 0; n < pages && bytes > 0; n++, - bytes -= PAGE_SIZE, offset += PAGE_SIZE) { - int avail = min_t(int, bytes, PAGE_SIZE); - - if (page[n] == NULL) - continue; - - squashfs_fill_page(page[n], buffer, offset, avail); - unlock_page(page[n]); - if (page[n] != target_page) - put_page(page[n]); - } - -out: - squashfs_cache_put(buffer); - return res; -} diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c index b685b6238316..49797729f143 100644 --- a/fs/squashfs/lz4_wrapper.c +++ b/fs/squashfs/lz4_wrapper.c @@ -119,10 +119,12 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, buff = stream->output; while (data) { if (bytes <= PAGE_SIZE) { - memcpy(data, buff, bytes); + if (!IS_ERR(data)) + memcpy(data, buff, bytes); break; } - memcpy(data, buff, PAGE_SIZE); + if (!IS_ERR(data)) + memcpy(data, buff, PAGE_SIZE); buff += PAGE_SIZE; bytes -= PAGE_SIZE; data = squashfs_next_page(output); @@ -139,5 +141,6 @@ const struct squashfs_decompressor squashfs_lz4_comp_ops = { .decompress = lz4_uncompress, .id = LZ4_COMPRESSION, .name = "lz4", + .alloc_buffer = 0, .supported = 1 }; diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index cb510a631968..d216aeefa865 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -93,10 +93,12 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, buff = stream->output; while (data) { if (bytes <= PAGE_SIZE) { - memcpy(data, buff, bytes); + if (!IS_ERR(data)) + memcpy(data, buff, bytes); break; } else { - memcpy(data, buff, PAGE_SIZE); + if (!IS_ERR(data)) + memcpy(data, buff, PAGE_SIZE); buff += PAGE_SIZE; bytes -= PAGE_SIZE; data = squashfs_next_page(output); @@ -116,5 +118,6 @@ const struct squashfs_decompressor squashfs_lzo_comp_ops = { 
.decompress = lzo_uncompress, .id = LZO_COMPRESSION, .name = "lzo", + .alloc_buffer = 0, .supported = 1 }; diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c index 520d323a99ce..b23b780d8f42 100644 --- a/fs/squashfs/page_actor.c +++ b/fs/squashfs/page_actor.c @@ -7,6 +7,8 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/pagemap.h> +#include "squashfs_fs_sb.h" +#include "decompressor.h" #include "page_actor.h" /* @@ -57,29 +59,62 @@ struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, } /* Implementation of page_actor for decompressing directly into page cache. */ +static void *handle_next_page(struct squashfs_page_actor *actor) +{ + int max_pages = (actor->length + PAGE_SIZE - 1) >> PAGE_SHIFT; + + if (actor->returned_pages == max_pages) + return NULL; + + if ((actor->next_page == actor->pages) || + (actor->next_index != actor->page[actor->next_page]->index)) { + if (actor->alloc_buffer) { + void *tmp_buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); + + if (tmp_buffer) { + actor->tmp_buffer = tmp_buffer; + actor->next_index++; + actor->returned_pages++; + return tmp_buffer; + } + } + + actor->next_index++; + actor->returned_pages++; + return ERR_PTR(-ENOMEM); + } + + actor->next_index++; + actor->returned_pages++; + return actor->pageaddr = kmap_local_page(actor->page[actor->next_page++]); +} + static void *direct_first_page(struct squashfs_page_actor *actor) { - actor->next_page = 1; - return actor->pageaddr = kmap_atomic(actor->page[0]); + return handle_next_page(actor); } static void *direct_next_page(struct squashfs_page_actor *actor) { if (actor->pageaddr) - kunmap_atomic(actor->pageaddr); + kunmap_local(actor->pageaddr); + + kfree(actor->tmp_buffer); + actor->pageaddr = actor->tmp_buffer = NULL; - return actor->pageaddr = actor->next_page == actor->pages ? 
NULL : - kmap_atomic(actor->page[actor->next_page++]); + return handle_next_page(actor); } static void direct_finish_page(struct squashfs_page_actor *actor) { if (actor->pageaddr) - kunmap_atomic(actor->pageaddr); + kunmap_local(actor->pageaddr); + + kfree(actor->tmp_buffer); } -struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page, - int pages, int length) +struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_info *msblk, + struct page **page, int pages, int length) { struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); @@ -90,7 +125,11 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page, actor->page = page; actor->pages = pages; actor->next_page = 0; + actor->returned_pages = 0; + actor->next_index = page[0]->index & ~((1 << (msblk->block_log - PAGE_SHIFT)) - 1); actor->pageaddr = NULL; + actor->tmp_buffer = NULL; + actor->alloc_buffer = msblk->decompressor->alloc_buffer; actor->squashfs_first_page = direct_first_page; actor->squashfs_next_page = direct_next_page; actor->squashfs_finish_page = direct_finish_page; diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index 2e3073ace009..24841d28bc0f 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -6,63 +6,29 @@ * Phillip Lougher <phillip@squashfs.org.uk> */ -#ifndef CONFIG_SQUASHFS_FILE_DIRECT -struct squashfs_page_actor { - void **page; - int pages; - int length; - int next_page; -}; - -static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page, - int pages, int length) -{ - struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); - - if (actor == NULL) - return NULL; - - actor->length = length ? : pages * PAGE_SIZE; - actor->page = page; - actor->pages = pages; - actor->next_page = 0; - return actor; -} - -static inline void *squashfs_first_page(struct squashfs_page_actor *actor) -{ - actor->next_page = 1; - return actor->page[0]; -} - -static inline void *squashfs_next_page(struct squashfs_page_actor *actor) -{ - return actor->next_page == actor->pages ? 
NULL : - actor->page[actor->next_page++]; -} - -static inline void squashfs_finish_page(struct squashfs_page_actor *actor) -{ - /* empty */ -} -#else struct squashfs_page_actor { union { void **buffer; struct page **page; }; void *pageaddr; + void *tmp_buffer; void *(*squashfs_first_page)(struct squashfs_page_actor *); void *(*squashfs_next_page)(struct squashfs_page_actor *); void (*squashfs_finish_page)(struct squashfs_page_actor *); int pages; int length; int next_page; + int alloc_buffer; + int returned_pages; + pgoff_t next_index; }; -extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int); -extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page - **, int, int); +extern struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, + int pages, int length); +extern struct squashfs_page_actor *squashfs_page_actor_init_special( + struct squashfs_sb_info *msblk, + struct page **page, int pages, int length); static inline void *squashfs_first_page(struct squashfs_page_actor *actor) { return actor->squashfs_first_page(actor); @@ -75,5 +41,8 @@ static inline void squashfs_finish_page(struct squashfs_page_actor *actor) { actor->squashfs_finish_page(actor); } -#endif +static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor) +{ + actor->alloc_buffer = 0; +} #endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 6d594ba2ed28..32565dafa7f3 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -29,7 +29,6 @@ #include <linux/module.h> #include <linux/magic.h> #include <linux/xattr.h> -#include <linux/backing-dev.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -113,24 +112,6 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem( return decompressor; } -static int squashfs_bdi_init(struct super_block *sb) -{ - int err; - unsigned int major = MAJOR(sb->s_dev); - unsigned int minor = MINOR(sb->s_dev); - - bdi_put(sb->s_bdi); - sb->s_bdi = &noop_backing_dev_info; - - err = super_setup_bdi_name(sb, "squashfs_%u_%u", major, minor); - if (err) - return err; - - sb->s_bdi->ra_pages = 0; - sb->s_bdi->io_pages = 0; - - return 0; -} static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) { @@ -146,20 +127,6 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) TRACE("Entered squashfs_fill_superblock\n"); - /* - * squashfs provides 'backing_dev_info' in order to disable read-ahead. For - * squashfs, I/O is not deferred, it is done immediately in read_folio, - * which means the user would always have to wait their own I/O. So the effect - * of readahead is very weak for squashfs. squashfs_bdi_init will set - * sb->s_bdi->ra_pages and sb->s_bdi->io_pages to 0 and close readahead for - * squashfs. 
- */ - err = squashfs_bdi_init(sb); - if (err) { - errorf(fc, "squashfs init bdi failed"); - return err; - } - sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); if (sb->s_fs_info == NULL) { ERROR("Failed to allocate squashfs_sb_info\n"); diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index 68f6d09bb3a2..6c49481a2f8c 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -131,6 +131,10 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, stream->buf.out_pos = 0; stream->buf.out_size = PAGE_SIZE; stream->buf.out = squashfs_first_page(output); + if (IS_ERR(stream->buf.out)) { + error = PTR_ERR(stream->buf.out); + goto finish; + } for (;;) { enum xz_ret xz_err; @@ -156,7 +160,10 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, if (stream->buf.out_pos == stream->buf.out_size) { stream->buf.out = squashfs_next_page(output); - if (stream->buf.out != NULL) { + if (IS_ERR(stream->buf.out)) { + error = PTR_ERR(stream->buf.out); + break; + } else if (stream->buf.out != NULL) { stream->buf.out_pos = 0; total += PAGE_SIZE; } @@ -171,6 +178,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, } } +finish: squashfs_finish_page(output); return error ? error : total + stream->buf.out_pos; @@ -183,5 +191,6 @@ const struct squashfs_decompressor squashfs_xz_comp_ops = { .decompress = squashfs_xz_uncompress, .id = XZ_COMPRESSION, .name = "xz", + .alloc_buffer = 1, .supported = 1 }; diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index a20e9042146b..cbb7afe7bc46 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -62,6 +62,11 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, stream->next_out = squashfs_first_page(output); stream->avail_in = 0; + if (IS_ERR(stream->next_out)) { + error = PTR_ERR(stream->next_out); + goto finish; + } + for (;;) { int zlib_err; @@ -85,7 +90,10 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, if (stream->avail_out == 0) { stream->next_out = squashfs_next_page(output); - if (stream->next_out != NULL) + if (IS_ERR(stream->next_out)) { + error = PTR_ERR(stream->next_out); + break; + } else if (stream->next_out != NULL) stream->avail_out = PAGE_SIZE; } @@ -107,6 +115,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, } } +finish: squashfs_finish_page(output); if (!error) @@ -122,6 +131,7 @@ const struct squashfs_decompressor squashfs_zlib_comp_ops = { .decompress = zlib_uncompress, .id = ZLIB_COMPRESSION, .name = "zlib", + .alloc_buffer = 1, .supported = 1 }; diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c index c40445dbf38c..0e407c4d8b3b 100644 --- a/fs/squashfs/zstd_wrapper.c +++ b/fs/squashfs/zstd_wrapper.c @@ -80,6 +80,10 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, out_buf.size = PAGE_SIZE; out_buf.dst = squashfs_first_page(output); + if (IS_ERR(out_buf.dst)) { + error = PTR_ERR(out_buf.dst); + goto finish; + } for (;;) { size_t zstd_err; @@ -104,7 +108,10 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, if (out_buf.pos == out_buf.size) { out_buf.dst = squashfs_next_page(output); - if (out_buf.dst == NULL) { + if (IS_ERR(out_buf.dst)) { + error = PTR_ERR(out_buf.dst); + break; + } else if (out_buf.dst == NULL) { /* Shouldn't run out of pages * before stream is done. 
*/ @@ -129,6 +136,8 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, } } +finish: + squashfs_finish_page(output); return error ? error : total_out; @@ -140,5 +149,6 @@ const struct squashfs_decompressor squashfs_zstd_comp_ops = { .decompress = zstd_uncompress, .id = ZSTD_COMPRESSION, .name = "zstd", + .alloc_buffer = 1, .supported = 1 }; |
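The fs/eventpoll.c hunks above switch ep_poll()'s waiter to ep_autoremove_wake_function(), which always unlinks the wait entry even when default_wake_function() reports the task was already woken; the motivating case is a process with many threads parked in epoll_wait() on one epoll instance. The program below is only a userspace illustration of that multi-waiter setup (the thread count, timeout, and eventfd choice are arbitrary), not a reproduction of the kernel-side fix; build with -lpthread.

```c
/*
 * Userspace illustration only: several threads blocked in epoll_wait()
 * on the same epoll instance, with one eventfd write making the fd
 * ready.  The kernel change above concerns how such sleepers are
 * unlinked from the epoll wait queue; nothing here exercises that path
 * directly.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>

#define NWAITERS 4

static int epfd, efd;

static void *waiter(void *arg)
{
	long id = (long)arg;
	struct epoll_event ev;
	uint64_t val;
	int n = epoll_wait(epfd, &ev, 1, 3000); /* finite timeout so the demo always exits */

	if (n == 1 && read(efd, &val, sizeof(val)) == (ssize_t)sizeof(val))
		printf("thread %ld consumed the event\n", id);
	else
		printf("thread %ld woke with nothing to do (n=%d)\n", id, n);
	return NULL;
}

int main(void)
{
	struct epoll_event ev = { .events = EPOLLIN };
	pthread_t tid[NWAITERS];
	uint64_t one = 1;
	long i;

	epfd = epoll_create1(0);
	efd = eventfd(0, EFD_NONBLOCK);  /* non-blocking: losers of the read race get EAGAIN */
	if (epfd < 0 || efd < 0) {
		perror("setup");
		return 1;
	}
	ev.data.fd = efd;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, efd, &ev) < 0) {
		perror("epoll_ctl");
		return 1;
	}

	for (i = 0; i < NWAITERS; i++)
		pthread_create(&tid[i], NULL, waiter, (void *)i);

	sleep(1);                        /* let the waiters block */
	if (write(efd, &one, sizeof(one)) != (ssize_t)sizeof(one))
		perror("write");

	for (i = 0; i < NWAITERS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}
```

Because the event is level-triggered and stays ready until one reader drains the eventfd, more than one waiter may return from epoll_wait(); the kernel patch is about cheaply removing waiters that were woken without consuming anything, for example when the whole process is killed and every blocked thread wakes at once.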