diff options
Diffstat (limited to 'fs')
139 files changed, 4221 insertions, 2180 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index c061c3f18e7c..24eb01087b6d 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -30,8 +30,8 @@ #include <linux/parser.h> #include <linux/idr.h> #include <net/9p/9p.h> -#include <net/9p/transport.h> #include <net/9p/client.h> +#include <net/9p/transport.h> #include "v9fs.h" #include "v9fs_vfs.h" @@ -234,7 +234,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, if (!v9ses->clnt->dotu) v9ses->flags &= ~V9FS_EXTENDED; - v9ses->maxdata = v9ses->clnt->msize; + v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; /* for legacy mode, fall back to V9FS_ACCESS_ANY */ if (!v9fs_extended(v9ses) && diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 57997fa14e69..c295ba786edd 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -46,9 +46,11 @@ extern struct dentry_operations v9fs_cached_dentry_operations; struct inode *v9fs_get_inode(struct super_block *sb, int mode); ino_t v9fs_qid2ino(struct p9_qid *qid); -void v9fs_stat2inode(struct p9_stat *, struct inode *, struct super_block *); +void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); -void v9fs_inode2stat(struct inode *inode, struct p9_stat *stat); +void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); void v9fs_dentry_release(struct dentry *); int v9fs_uflags2omode(int uflags, int extended); + +ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 97d3aed57983..6fcb1e7095cf 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -38,7 +38,6 @@ #include "v9fs.h" #include "v9fs_vfs.h" -#include "fid.h" /** * v9fs_vfs_readpage - read an entire page in from 9P @@ -53,14 +52,12 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page) int retval; loff_t offset; char *buffer; - struct p9_fid *fid; P9_DPRINTK(P9_DEBUG_VFS, 
"\n"); - fid = filp->private_data; buffer = kmap(page); offset = page_offset(page); - retval = p9_client_readn(fid, buffer, offset, PAGE_CACHE_SIZE); + retval = v9fs_file_readn(filp, buffer, NULL, offset, PAGE_CACHE_SIZE); if (retval < 0) goto done; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index e298fe194093..873cd31baa47 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -45,7 +45,7 @@ * */ -static inline int dt_type(struct p9_stat *mistat) +static inline int dt_type(struct p9_wstat *mistat) { unsigned long perm = mistat->mode; int rettype = DT_REG; @@ -69,32 +69,58 @@ static inline int dt_type(struct p9_stat *mistat) static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) { int over; + struct p9_wstat st; + int err; struct p9_fid *fid; - struct v9fs_session_info *v9ses; - struct inode *inode; - struct p9_stat *st; + int buflen; + char *statbuf; + int n, i = 0; P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); - inode = filp->f_path.dentry->d_inode; - v9ses = v9fs_inode2v9ses(inode); fid = filp->private_data; - while ((st = p9_client_dirread(fid, filp->f_pos)) != NULL) { - if (IS_ERR(st)) - return PTR_ERR(st); - over = filldir(dirent, st->name.str, st->name.len, filp->f_pos, - v9fs_qid2ino(&st->qid), dt_type(st)); + buflen = fid->clnt->msize - P9_IOHDRSZ; + statbuf = kmalloc(buflen, GFP_KERNEL); + if (!statbuf) + return -ENOMEM; - if (over) + while (1) { + err = v9fs_file_readn(filp, statbuf, NULL, buflen, + fid->rdir_fpos); + if (err <= 0) break; - filp->f_pos += st->size; - kfree(st); - st = NULL; + n = err; + while (i < n) { + err = p9stat_read(statbuf + i, buflen-i, &st, + fid->clnt->dotu); + if (err) { + P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); + err = -EIO; + p9stat_free(&st); + goto free_and_exit; + } + + i += st.size+2; + fid->rdir_fpos += st.size+2; + + over = filldir(dirent, st.name, strlen(st.name), + filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st)); + + filp->f_pos += st.size+2; + + 
p9stat_free(&st); + + if (over) { + err = 0; + goto free_and_exit; + } + } } - kfree(st); - return 0; +free_and_exit: + kfree(statbuf); + return err; } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 52944d2249a4..041c52692284 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -120,23 +120,72 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) } /** - * v9fs_file_read - read from a file + * v9fs_file_readn - read from a file * @filp: file pointer to read * @data: data buffer to read data into + * @udata: user data buffer to read data into * @count: size of buffer * @offset: offset at which to read data * */ + +ssize_t +v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, + u64 offset) +{ + int n, total; + struct p9_fid *fid = filp->private_data; + + P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid, + (long long unsigned) offset, count); + + n = 0; + total = 0; + do { + n = p9_client_read(fid, data, udata, offset, count); + if (n <= 0) + break; + + if (data) + data += n; + if (udata) + udata += n; + + offset += n; + count -= n; + total += n; + } while (count > 0 && n == (fid->clnt->msize - P9_IOHDRSZ)); + + if (n < 0) + total = n; + + return total; +} + +/** + * v9fs_file_read - read from a file + * @filp: file pointer to read + * @udata: user data buffer to read data into + * @count: size of buffer + * @offset: offset at which to read data + * + */ + static ssize_t -v9fs_file_read(struct file *filp, char __user * data, size_t count, +v9fs_file_read(struct file *filp, char __user *udata, size_t count, loff_t * offset) { int ret; struct p9_fid *fid; - P9_DPRINTK(P9_DEBUG_VFS, "\n"); + P9_DPRINTK(P9_DEBUG_VFS, "count %d offset %lld\n", count, *offset); fid = filp->private_data; - ret = p9_client_uread(fid, data, *offset, count); + + if (count > (fid->clnt->msize - P9_IOHDRSZ)) + ret = v9fs_file_readn(filp, NULL, udata, count, *offset); + else + ret = p9_client_read(fid, NULL, udata, 
*offset, count); + if (ret > 0) *offset += ret; @@ -156,19 +205,38 @@ static ssize_t v9fs_file_write(struct file *filp, const char __user * data, size_t count, loff_t * offset) { - int ret; + int n, rsize, total = 0; struct p9_fid *fid; + struct p9_client *clnt; struct inode *inode = filp->f_path.dentry->d_inode; + int origin = *offset; P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, (int)count, (int)*offset); fid = filp->private_data; - ret = p9_client_uwrite(fid, data, *offset, count); - if (ret > 0) { - invalidate_inode_pages2_range(inode->i_mapping, *offset, - *offset+ret); - *offset += ret; + clnt = fid->clnt; + + rsize = fid->iounit; + if (!rsize || rsize > clnt->msize-P9_IOHDRSZ) + rsize = clnt->msize - P9_IOHDRSZ; + + do { + if (count < rsize) + rsize = count; + + n = p9_client_write(fid, NULL, data+total, *offset+total, + rsize); + if (n <= 0) + break; + count -= n; + total += n; + } while (count > 0); + + if (total > 0) { + invalidate_inode_pages2_range(inode->i_mapping, origin, + origin+total); + *offset += total; } if (*offset > inode->i_size) { @@ -176,7 +244,10 @@ v9fs_file_write(struct file *filp, const char __user * data, inode->i_blocks = (inode->i_size + 512 - 1) >> 9; } - return ret; + if (n < 0) + return n; + + return total; } static const struct file_operations v9fs_cached_file_operations = { diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index e83aa5ebe861..8314d3f43b71 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -334,7 +334,7 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, { int err, umode; struct inode *ret; - struct p9_stat *st; + struct p9_wstat *st; ret = NULL; st = p9_client_stat(fid); @@ -417,6 +417,8 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, struct p9_fid *dfid, *ofid, *fid; struct inode *inode; + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + err = 0; ofid = NULL; fid = NULL; @@ -424,6 +426,7 @@ v9fs_create(struct v9fs_session_info 
*v9ses, struct inode *dir, dfid = v9fs_fid_clone(dentry->d_parent); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid clone failed %d\n", err); dfid = NULL; goto error; } @@ -432,18 +435,22 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, ofid = p9_client_walk(dfid, 0, NULL, 1); if (IS_ERR(ofid)) { err = PTR_ERR(ofid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); ofid = NULL; goto error; } err = p9_client_fcreate(ofid, name, perm, mode, extension); - if (err < 0) + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err); goto error; + } /* now walk from the parent so we can get unopened fid */ fid = p9_client_walk(dfid, 1, &name, 0); if (IS_ERR(fid)) { err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); fid = NULL; goto error; } else @@ -453,6 +460,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } @@ -734,7 +742,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, int err; struct v9fs_session_info *v9ses; struct p9_fid *fid; - struct p9_stat *st; + struct p9_wstat *st; P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); err = -EPERM; @@ -815,10 +823,9 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) */ void -v9fs_stat2inode(struct p9_stat *stat, struct inode *inode, +v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, struct super_block *sb) { - int n; char ext[32]; struct v9fs_session_info *v9ses = sb->s_fs_info; @@ -842,11 +849,7 @@ v9fs_stat2inode(struct p9_stat *stat, struct inode *inode, int major = -1; int minor = -1; - n = stat->extension.len; - if (n > sizeof(ext)-1) - n = sizeof(ext)-1; - memmove(ext, stat->extension.str, n); - ext[n] = 0; + strncpy(ext, stat->extension, sizeof(ext)); sscanf(ext, "%c %u %u", 
&type, &major, &minor); switch (type) { case 'c': @@ -857,10 +860,11 @@ v9fs_stat2inode(struct p9_stat *stat, struct inode *inode, break; default: P9_DPRINTK(P9_DEBUG_ERROR, - "Unknown special type %c (%.*s)\n", type, - stat->extension.len, stat->extension.str); + "Unknown special type %c %s\n", type, + stat->extension); }; inode->i_rdev = MKDEV(major, minor); + init_special_inode(inode, inode->i_mode, inode->i_rdev); } else inode->i_rdev = 0; @@ -904,7 +908,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) struct v9fs_session_info *v9ses; struct p9_fid *fid; - struct p9_stat *st; + struct p9_wstat *st; P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name); retval = -EPERM; @@ -926,15 +930,10 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) } /* copy extension buffer into buffer */ - if (st->extension.len < buflen) - buflen = st->extension.len + 1; - - memmove(buffer, st->extension.str, buflen - 1); - buffer[buflen-1] = 0; + strncpy(buffer, st->extension, buflen); P9_DPRINTK(P9_DEBUG_VFS, - "%s -> %.*s (%s)\n", dentry->d_name.name, st->extension.len, - st->extension.str, buffer); + "%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer); retval = buflen; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index bf59c3960494..d6cb1a0ca724 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -111,7 +111,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, struct inode *inode = NULL; struct dentry *root = NULL; struct v9fs_session_info *v9ses = NULL; - struct p9_stat *st = NULL; + struct p9_wstat *st = NULL; int mode = S_IRWXUGO | S_ISVTX; uid_t uid = current->fsuid; gid_t gid = current->fsgid; @@ -161,10 +161,14 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, sb->s_root = root; root->d_inode->i_ino = v9fs_qid2ino(&st->qid); + v9fs_stat2inode(st, root->d_inode, sb); + v9fs_fid_add(root, fid); + p9stat_free(st); kfree(st); +P9_DPRINTK(P9_DEBUG_VFS, " return 
simple set mount\n"); return simple_set_mnt(mnt, sb); release_sb: diff --git a/fs/Kconfig b/fs/Kconfig index 9c43045ebbf9..e46297f020c1 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -6,61 +6,9 @@ menu "File systems" if BLOCK -config EXT2_FS - tristate "Second extended fs support" - help - Ext2 is a standard Linux file system for hard disks. - - To compile this file system support as a module, choose M here: the - module will be called ext2. - - If unsure, say Y. - -config EXT2_FS_XATTR - bool "Ext2 extended attributes" - depends on EXT2_FS - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). - - If unsure, say N. - -config EXT2_FS_POSIX_ACL - bool "Ext2 POSIX Access Control Lists" - depends on EXT2_FS_XATTR - select FS_POSIX_ACL - help - Posix Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. - - To learn more about Access Control Lists, visit the Posix ACLs for - Linux website <http://acl.bestbits.at/>. - - If you don't know what Access Control Lists are, say N - -config EXT2_FS_SECURITY - bool "Ext2 Security Labels" - depends on EXT2_FS_XATTR - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute handler for file security - labels in the ext2 filesystem. - - If you are not using a security module that requires using - extended attributes for file security labels, say N. - -config EXT2_FS_XIP - bool "Ext2 execute in place support" - depends on EXT2_FS && MMU - help - Execute in place can be used on memory-backed block devices. If you - enable this option, you can select to mount block devices which are - capable of this feature without using the page cache. - - If you do not use a block device that is capable of using this, - or if unsure, say N. 
+source "fs/ext2/Kconfig" +source "fs/ext3/Kconfig" +source "fs/ext4/Kconfig" config FS_XIP # execute in place @@ -68,218 +16,8 @@ config FS_XIP depends on EXT2_FS_XIP default y -config EXT3_FS - tristate "Ext3 journalling file system support" - select JBD - help - This is the journalling version of the Second extended file system - (often called ext3), the de facto standard Linux file system - (method to organize files on a storage device) for hard disks. - - The journalling code included in this driver means you do not have - to run e2fsck (file system checker) on your file systems after a - crash. The journal keeps track of any changes that were being made - at the time the system crashed, and can ensure that your file system - is consistent without the need for a lengthy check. - - Other than adding the journal to the file system, the on-disk format - of ext3 is identical to ext2. It is possible to freely switch - between using the ext3 driver and the ext2 driver, as long as the - file system has been cleanly unmounted, or e2fsck is run on the file - system. - - To add a journal on an existing ext2 file system or change the - behavior of ext3 file systems, you can use the tune2fs utility ("man - tune2fs"). To modify attributes of files and directories on ext3 - file systems, use chattr ("man chattr"). You need to be using - e2fsprogs version 1.20 or later in order to create ext3 journals - (available at <http://sourceforge.net/projects/e2fsprogs/>). - - To compile this file system support as a module, choose M here: the - module will be called ext3. - -config EXT3_FS_XATTR - bool "Ext3 extended attributes" - depends on EXT3_FS - default y - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). - - If unsure, say N. - - You need this for POSIX ACL support on ext3. 
- -config EXT3_FS_POSIX_ACL - bool "Ext3 POSIX Access Control Lists" - depends on EXT3_FS_XATTR - select FS_POSIX_ACL - help - Posix Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. - - To learn more about Access Control Lists, visit the Posix ACLs for - Linux website <http://acl.bestbits.at/>. - - If you don't know what Access Control Lists are, say N - -config EXT3_FS_SECURITY - bool "Ext3 Security Labels" - depends on EXT3_FS_XATTR - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute handler for file security - labels in the ext3 filesystem. - - If you are not using a security module that requires using - extended attributes for file security labels, say N. - -config EXT4_FS - tristate "The Extended 4 (ext4) filesystem" - select JBD2 - select CRC16 - help - This is the next generation of the ext3 filesystem. - - Unlike the change from ext2 filesystem to ext3 filesystem, - the on-disk format of ext4 is not forwards compatible with - ext3; it is based on extent maps and it supports 48-bit - physical block numbers. The ext4 filesystem also supports delayed - allocation, persistent preallocation, high resolution time stamps, - and a number of other features to improve performance and speed - up fsck time. For more information, please see the web pages at - http://ext4.wiki.kernel.org. - - The ext4 filesystem will support mounting an ext3 - filesystem; while there will be some performance gains from - the delayed allocation and inode table readahead, the best - performance gains will require enabling ext4 features in the - filesystem, or formating a new filesystem as an ext4 - filesystem initially. - - To compile this file system support as a module, choose M here. The - module will be called ext4dev. - - If unsure, say N. 
- -config EXT4DEV_COMPAT - bool "Enable ext4dev compatibility" - depends on EXT4_FS - help - Starting with 2.6.28, the name of the ext4 filesystem was - renamed from ext4dev to ext4. Unfortunately there are some - legacy userspace programs (such as klibc's fstype) have - "ext4dev" hardcoded. - - To enable backwards compatibility so that systems that are - still expecting to mount ext4 filesystems using ext4dev, - chose Y here. This feature will go away by 2.6.31, so - please arrange to get your userspace programs fixed! - -config EXT4_FS_XATTR - bool "Ext4 extended attributes" - depends on EXT4_FS - default y - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). - - If unsure, say N. - - You need this for POSIX ACL support on ext4. - -config EXT4_FS_POSIX_ACL - bool "Ext4 POSIX Access Control Lists" - depends on EXT4_FS_XATTR - select FS_POSIX_ACL - help - POSIX Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. - - To learn more about Access Control Lists, visit the POSIX ACLs for - Linux website <http://acl.bestbits.at/>. - - If you don't know what Access Control Lists are, say N - -config EXT4_FS_SECURITY - bool "Ext4 Security Labels" - depends on EXT4_FS_XATTR - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute handler for file security - labels in the ext4 filesystem. - - If you are not using a security module that requires using - extended attributes for file security labels, say N. - -config JBD - tristate - help - This is a generic journalling layer for block devices. It is - currently used by the ext3 file system, but it could also be - used to add journal support to other file systems or block - devices such as RAID or LVM. 
- - If you are using the ext3 file system, you need to say Y here. - If you are not using ext3 then you will probably want to say N. - - To compile this device as a module, choose M here: the module will be - called jbd. If you are compiling ext3 into the kernel, you - cannot compile this code as a module. - -config JBD_DEBUG - bool "JBD (ext3) debugging support" - depends on JBD && DEBUG_FS - help - If you are using the ext3 journaled file system (or potentially any - other file system/device using JBD), this option allows you to - enable debugging output while the system is running, in order to - help track down any problems you are having. By default the - debugging output will be turned off. - - If you select Y here, then you will be able to turn on debugging - with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a - number between 1 and 5, the higher the number, the more debugging - output is generated. To turn debugging off again, do - "echo 0 > /sys/kernel/debug/jbd/jbd-debug". - -config JBD2 - tristate - select CRC32 - help - This is a generic journaling layer for block devices that support - both 32-bit and 64-bit block numbers. It is currently used by - the ext4 and OCFS2 filesystems, but it could also be used to add - journal support to other file systems or block devices such - as RAID or LVM. - - If you are using ext4 or OCFS2, you need to say Y here. - If you are not using ext4 or OCFS2 then you will - probably want to say N. - - To compile this device as a module, choose M here. The module will be - called jbd2. If you are compiling ext4 or OCFS2 into the kernel, - you cannot compile this code as a module. 
- -config JBD2_DEBUG - bool "JBD2 (ext4) debugging support" - depends on JBD2 && DEBUG_FS - help - If you are using the ext4 journaled file system (or - potentially any other filesystem/device using JBD2), this option - allows you to enable debugging output while the system is running, - in order to help track down any problems you are having. - By default, the debugging output will be turned off. - - If you select Y here, then you will be able to turn on debugging - with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a - number between 1 and 5. The higher the number, the more debugging - output is generated. To turn debugging off again, do - "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug". +source "fs/jbd/Kconfig" +source "fs/jbd2/Kconfig" config FS_MBCACHE # Meta block cache for Extended Attributes (ext2/ext3/ext4) @@ -1168,195 +906,7 @@ config EFS_FS To compile the EFS file system support as a module, choose M here: the module will be called efs. -config JFFS2_FS - tristate "Journalling Flash File System v2 (JFFS2) support" - select CRC32 - depends on MTD - help - JFFS2 is the second generation of the Journalling Flash File System - for use on diskless embedded devices. It provides improved wear - levelling, compression and support for hard links. You cannot use - this on normal block devices, only on 'MTD' devices. - - Further information on the design and implementation of JFFS2 is - available at <http://sources.redhat.com/jffs2/>. - -config JFFS2_FS_DEBUG - int "JFFS2 debugging verbosity (0 = quiet, 2 = noisy)" - depends on JFFS2_FS - default "0" - help - This controls the amount of debugging messages produced by the JFFS2 - code. Set it to zero for use in production systems. For evaluation, - testing and debugging, it's advisable to set it to one. This will - enable a few assertions and will print debugging messages at the - KERN_DEBUG loglevel, where they won't normally be visible. 
Level 2 - is unlikely to be useful - it enables extra debugging in certain - areas which at one point needed debugging, but when the bugs were - located and fixed, the detailed messages were relegated to level 2. - - If reporting bugs, please try to have available a full dump of the - messages at debug level 1 while the misbehaviour was occurring. - -config JFFS2_FS_WRITEBUFFER - bool "JFFS2 write-buffering support" - depends on JFFS2_FS - default y - help - This enables the write-buffering support in JFFS2. - - This functionality is required to support JFFS2 on the following - types of flash devices: - - NAND flash - - NOR flash with transparent ECC - - DataFlash - -config JFFS2_FS_WBUF_VERIFY - bool "Verify JFFS2 write-buffer reads" - depends on JFFS2_FS_WRITEBUFFER - default n - help - This causes JFFS2 to read back every page written through the - write-buffer, and check for errors. - -config JFFS2_SUMMARY - bool "JFFS2 summary support (EXPERIMENTAL)" - depends on JFFS2_FS && EXPERIMENTAL - default n - help - This feature makes it possible to use summary information - for faster filesystem mount. - - The summary information can be inserted into a filesystem image - by the utility 'sumtool'. - - If unsure, say 'N'. - -config JFFS2_FS_XATTR - bool "JFFS2 XATTR support (EXPERIMENTAL)" - depends on JFFS2_FS && EXPERIMENTAL - default n - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). - - If unsure, say N. - -config JFFS2_FS_POSIX_ACL - bool "JFFS2 POSIX Access Control Lists" - depends on JFFS2_FS_XATTR - default y - select FS_POSIX_ACL - help - Posix Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. - - To learn more about Access Control Lists, visit the Posix ACLs for - Linux website <http://acl.bestbits.at/>. 
- - If you don't know what Access Control Lists are, say N - -config JFFS2_FS_SECURITY - bool "JFFS2 Security Labels" - depends on JFFS2_FS_XATTR - default y - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute handler for file security - labels in the jffs2 filesystem. - - If you are not using a security module that requires using - extended attributes for file security labels, say N. - -config JFFS2_COMPRESSION_OPTIONS - bool "Advanced compression options for JFFS2" - depends on JFFS2_FS - default n - help - Enabling this option allows you to explicitly choose which - compression modules, if any, are enabled in JFFS2. Removing - compressors can mean you cannot read existing file systems, - and enabling experimental compressors can mean that you - write a file system which cannot be read by a standard kernel. - - If unsure, you should _definitely_ say 'N'. - -config JFFS2_ZLIB - bool "JFFS2 ZLIB compression support" if JFFS2_COMPRESSION_OPTIONS - select ZLIB_INFLATE - select ZLIB_DEFLATE - depends on JFFS2_FS - default y - help - Zlib is designed to be a free, general-purpose, legally unencumbered, - lossless data-compression library for use on virtually any computer - hardware and operating system. See <http://www.gzip.org/zlib/> for - further information. - - Say 'Y' if unsure. - -config JFFS2_LZO - bool "JFFS2 LZO compression support" if JFFS2_COMPRESSION_OPTIONS - select LZO_COMPRESS - select LZO_DECOMPRESS - depends on JFFS2_FS - default n - help - minilzo-based compression. Generally works better than Zlib. - - This feature was added in July, 2007. Say 'N' if you need - compatibility with older bootloaders or kernels. - -config JFFS2_RTIME - bool "JFFS2 RTIME compression support" if JFFS2_COMPRESSION_OPTIONS - depends on JFFS2_FS - default y - help - Rtime does manage to recompress already-compressed data. Say 'Y' if unsure. 
- -config JFFS2_RUBIN - bool "JFFS2 RUBIN compression support" if JFFS2_COMPRESSION_OPTIONS - depends on JFFS2_FS - default n - help - RUBINMIPS and DYNRUBIN compressors. Say 'N' if unsure. - -choice - prompt "JFFS2 default compression mode" if JFFS2_COMPRESSION_OPTIONS - default JFFS2_CMODE_PRIORITY - depends on JFFS2_FS - help - You can set here the default compression mode of JFFS2 from - the available compression modes. Don't touch if unsure. - -config JFFS2_CMODE_NONE - bool "no compression" - help - Uses no compression. - -config JFFS2_CMODE_PRIORITY - bool "priority" - help - Tries the compressors in a predefined order and chooses the first - successful one. - -config JFFS2_CMODE_SIZE - bool "size (EXPERIMENTAL)" - help - Tries all compressors and chooses the one which has the smallest - result. - -config JFFS2_CMODE_FAVOURLZO - bool "Favour LZO" - help - Tries all compressors and chooses the one which has the smallest - result but gives some preference to LZO (which has faster - decompression) at the expense of size. - -endchoice - +source "fs/jffs2/Kconfig" # UBIFS File system configuration source "fs/ubifs/Kconfig" @@ -1913,148 +1463,7 @@ config SMB_NLS_REMOTE smbmount from samba 2.2.0 or later supports this. -config CIFS - tristate "CIFS support (advanced network filesystem, SMBFS successor)" - depends on INET - select NLS - help - This is the client VFS module for the Common Internet File System - (CIFS) protocol which is the successor to the Server Message Block - (SMB) protocol, the native file sharing mechanism for most early - PC operating systems. The CIFS protocol is fully supported by - file servers such as Windows 2000 (including Windows 2003, NT 4 - and Windows XP) as well by Samba (which provides excellent CIFS - server support for Linux and many other operating systems). Limited - support for OS/2 and Windows ME and similar servers is provided as - well. 
- - The cifs module provides an advanced network file system - client for mounting to CIFS compliant servers. It includes - support for DFS (hierarchical name space), secure per-user - session establishment via Kerberos or NTLM or NTLMv2, - safe distributed caching (oplock), optional packet - signing, Unicode and other internationalization improvements. - If you need to mount to Samba or Windows from this machine, say Y. - -config CIFS_STATS - bool "CIFS statistics" - depends on CIFS - help - Enabling this option will cause statistics for each server share - mounted by the cifs client to be displayed in /proc/fs/cifs/Stats - -config CIFS_STATS2 - bool "Extended statistics" - depends on CIFS_STATS - help - Enabling this option will allow more detailed statistics on SMB - request timing to be displayed in /proc/fs/cifs/DebugData and also - allow optional logging of slow responses to dmesg (depending on the - value of /proc/fs/cifs/cifsFYI, see fs/cifs/README for more details). - These additional statistics may have a minor effect on performance - and memory utilization. - - Unless you are a developer or are doing network performance analysis - or tuning, say N. - -config CIFS_WEAK_PW_HASH - bool "Support legacy servers which use weaker LANMAN security" - depends on CIFS - help - Modern CIFS servers including Samba and most Windows versions - (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos) - security mechanisms. These hash the password more securely - than the mechanisms used in the older LANMAN version of the - SMB protocol but LANMAN based authentication is needed to - establish sessions with some old SMB servers. - - Enabling this option allows the cifs module to mount to older - LANMAN based servers such as OS/2 and Windows 95, but such - mounts may be less secure than mounts using NTLM or more recent - security mechanisms if you are on a public network. 
Unless you - have a need to access old SMB servers (and are on a private - network) you probably want to say N. Even if this support - is enabled in the kernel build, LANMAN authentication will not be - used automatically. At runtime LANMAN mounts are disabled but - can be set to required (or optional) either in - /proc/fs/cifs (see fs/cifs/README for more detail) or via an - option on the mount command. This support is disabled by - default in order to reduce the possibility of a downgrade - attack. - - If unsure, say N. - -config CIFS_UPCALL - bool "Kerberos/SPNEGO advanced session setup" - depends on CIFS && KEYS - help - Enables an upcall mechanism for CIFS which accesses - userspace helper utilities to provide SPNEGO packaged (RFC 4178) - Kerberos tickets which are needed to mount to certain secure servers - (for which more secure Kerberos authentication is required). If - unsure, say N. - -config CIFS_XATTR - bool "CIFS extended attributes" - depends on CIFS - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). CIFS maps the name of - extended attributes beginning with the user namespace prefix - to SMB/CIFS EAs. EAs are stored on Windows servers without the - user namespace prefix, but their names are seen by Linux cifs clients - prefaced by the user namespace prefix. The system namespace - (used by some filesystems to store ACLs) is not supported at - this time. - - If unsure, say N. - -config CIFS_POSIX - bool "CIFS POSIX Extensions" - depends on CIFS_XATTR - help - Enabling this option will cause the cifs client to attempt to - negotiate a newer dialect with servers, such as Samba 3.0.5 - or later, that optionally can handle more POSIX like (rather - than Windows like) file behavior. 
It also enables - support for POSIX ACLs (getfacl and setfacl) to servers - (such as Samba 3.10 and later) which can negotiate - CIFS POSIX ACL support. If unsure, say N. - -config CIFS_DEBUG2 - bool "Enable additional CIFS debugging routines" - depends on CIFS - help - Enabling this option adds a few more debugging routines - to the cifs code which slightly increases the size of - the cifs module and can cause additional logging of debug - messages in some error paths, slowing performance. This - option can be turned off unless you are debugging - cifs problems. If unsure, say N. - -config CIFS_EXPERIMENTAL - bool "CIFS Experimental Features (EXPERIMENTAL)" - depends on CIFS && EXPERIMENTAL - help - Enables cifs features under testing. These features are - experimental and currently include DFS support and directory - change notification ie fcntl(F_DNOTIFY), as well as the upcall - mechanism which will be used for Kerberos session negotiation - and uid remapping. Some of these features also may depend on - setting a value of 1 to the pseudo-file /proc/fs/cifs/Experimental - (which is disabled by default). See the file fs/cifs/README - for more details. If unsure, say N. - -config CIFS_DFS_UPCALL - bool "DFS feature support (EXPERIMENTAL)" - depends on CIFS_EXPERIMENTAL - depends on KEYS - help - Enables an upcall mechanism for CIFS which contacts userspace - helper utilities to provide server name resolution (host names to - IP addresses) which is needed for implicit mounts of DFS junction - points. If unsure, say N. 
+source "fs/cifs/Kconfig" config NCP_FS tristate "NCP file system support (to mount NetWare volumes)" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 17c9c5ec14c5..ce9fb3fbfae4 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -25,7 +25,7 @@ config BINFMT_ELF config COMPAT_BINFMT_ELF bool - depends on COMPAT && MMU + depends on COMPAT && BINFMT_ELF config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" @@ -40,6 +40,28 @@ config BINFMT_ELF_FDPIC It is also possible to run FDPIC ELF binaries on MMU linux also. +config CORE_DUMP_DEFAULT_ELF_HEADERS + bool "Write ELF core dumps with partial segments" + default n + depends on BINFMT_ELF + help + ELF core dump files describe each memory mapping of the crashed + process, and can contain or omit the memory contents of each one. + The contents of an unmodified text mapping are omitted by default. + + For an unmodified text mapping of an ELF object, including just + the first page of the file in a core dump makes it possible to + identify the build ID bits in the file, without paying the i/o + cost and disk space to dump all the text. However, versions of + GDB before 6.7 are confused by ELF core dump files in this format. + + The core dump behavior can be controlled per process using + the /proc/PID/coredump_filter pseudo-file; this setting is + inherited. See Documentation/filesystems/proc.txt for details. + + This config option changes the default setting of coredump_filter + seen at boot time. If unsure, say N. 
+ config BINFMT_FLAT bool "Kernel support for flat binaries" depends on !MMU && (!FRV || BROKEN) diff --git a/fs/Makefile b/fs/Makefile index b6f27dc26b72..2168c902d5ca 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -8,7 +8,7 @@ obj-y := open.o read_write.o file_table.o super.o \ char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ ioctl.o readdir.o select.o fifo.o dcache.o inode.o \ - attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ + attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o drop_caches.o splice.o sync.o utimes.o \ stack.o @@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o +obj-$(CONFIG_AIO) += aio.o obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o @@ -70,7 +71,7 @@ obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line obj-$(CONFIG_REISERFS_FS) += reiserfs/ obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 -obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4dev +obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4 obj-$(CONFIG_JBD) += jbd/ obj-$(CONFIG_JBD2) += jbd2/ obj-$(CONFIG_EXT2_FS) += ext2/ diff --git a/fs/afs/file.c b/fs/afs/file.c index 525f7c56e068..a3901769a96c 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -50,8 +50,8 @@ const struct address_space_operations afs_fs_aops = { .launder_page = afs_launder_page, .releasepage = afs_releasepage, .invalidatepage = afs_invalidatepage, - .prepare_write = afs_prepare_write, - .commit_write = afs_commit_write, + .write_begin = afs_write_begin, + .write_end = afs_write_end, .writepage = afs_writepage, .writepages = afs_writepages, }; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 3cb6920ff30b..67f259d99cd6 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -728,8 +728,12 @@ extern int 
afs_volume_release_fileserver(struct afs_vnode *, */ extern int afs_set_page_dirty(struct page *); extern void afs_put_writeback(struct afs_writeback *); -extern int afs_prepare_write(struct file *, struct page *, unsigned, unsigned); -extern int afs_commit_write(struct file *, struct page *, unsigned, unsigned); +extern int afs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); +extern int afs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); extern int afs_writepage(struct page *, struct writeback_control *); extern int afs_writepages(struct address_space *, struct writeback_control *); extern int afs_write_inode(struct inode *, int); diff --git a/fs/afs/write.c b/fs/afs/write.c index 065b4e10681a..d6b85dab35fc 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -84,15 +84,23 @@ void afs_put_writeback(struct afs_writeback *wb) * partly or wholly fill a page that's under preparation for writing */ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, - unsigned start, unsigned len, struct page *page) + loff_t pos, unsigned len, struct page *page) { + loff_t i_size; + unsigned eof; int ret; - _enter(",,%u,%u", start, len); + _enter(",,%llu,%u", (unsigned long long)pos, len); - ASSERTCMP(start + len, <=, PAGE_SIZE); + ASSERTCMP(len, <=, PAGE_CACHE_SIZE); - ret = afs_vnode_fetch_data(vnode, key, start, len, page); + i_size = i_size_read(&vnode->vfs_inode); + if (pos + len > i_size) + eof = i_size; + else + eof = PAGE_CACHE_SIZE; + + ret = afs_vnode_fetch_data(vnode, key, 0, eof, page); if (ret < 0) { if (ret == -ENOENT) { _debug("got NOENT from server" @@ -107,109 +115,55 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, } /* - * prepare a page for being written to - */ -static int afs_prepare_page(struct afs_vnode *vnode, struct page *page, - struct 
key *key, unsigned offset, unsigned to) -{ - unsigned eof, tail, start, stop, len; - loff_t i_size, pos; - void *p; - int ret; - - _enter(""); - - if (offset == 0 && to == PAGE_SIZE) - return 0; - - p = kmap_atomic(page, KM_USER0); - - i_size = i_size_read(&vnode->vfs_inode); - pos = (loff_t) page->index << PAGE_SHIFT; - if (pos >= i_size) { - /* partial write, page beyond EOF */ - _debug("beyond"); - if (offset > 0) - memset(p, 0, offset); - if (to < PAGE_SIZE) - memset(p + to, 0, PAGE_SIZE - to); - kunmap_atomic(p, KM_USER0); - return 0; - } - - if (i_size - pos >= PAGE_SIZE) { - /* partial write, page entirely before EOF */ - _debug("before"); - tail = eof = PAGE_SIZE; - } else { - /* partial write, page overlaps EOF */ - eof = i_size - pos; - _debug("overlap %u", eof); - tail = max(eof, to); - if (tail < PAGE_SIZE) - memset(p + tail, 0, PAGE_SIZE - tail); - if (offset > eof) - memset(p + eof, 0, PAGE_SIZE - eof); - } - - kunmap_atomic(p, KM_USER0); - - ret = 0; - if (offset > 0 || eof > to) { - /* need to fill one or two bits that aren't going to be written - * (cover both fillers in one read if there are two) */ - start = (offset > 0) ? 0 : to; - stop = (eof > to) ? 
eof : offset; - len = stop - start; - _debug("wr=%u-%u av=0-%u rd=%u@%u", - offset, to, eof, start, len); - ret = afs_fill_page(vnode, key, start, len, page); - } - - _leave(" = %d", ret); - return ret; -} - -/* * prepare to perform part of a write to a page - * - the caller holds the page locked, preventing it from being written out or - * modified by anyone else */ -int afs_prepare_write(struct file *file, struct page *page, - unsigned offset, unsigned to) +int afs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { struct afs_writeback *candidate, *wb; struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); + struct page *page; struct key *key = file->private_data; - pgoff_t index; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + len; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; int ret; _enter("{%x:%u},{%lx},%u,%u", - vnode->fid.vid, vnode->fid.vnode, page->index, offset, to); + vnode->fid.vid, vnode->fid.vnode, index, from, to); candidate = kzalloc(sizeof(*candidate), GFP_KERNEL); if (!candidate) return -ENOMEM; candidate->vnode = vnode; - candidate->first = candidate->last = page->index; - candidate->offset_first = offset; + candidate->first = candidate->last = index; + candidate->offset_first = from; candidate->to_last = to; candidate->usage = 1; candidate->state = AFS_WBACK_PENDING; init_waitqueue_head(&candidate->waitq); + page = __grab_cache_page(mapping, index); + if (!page) { + kfree(candidate); + return -ENOMEM; + } + *pagep = page; + /* page won't leak in error case: it eventually gets cleaned off LRU */ + if (!PageUptodate(page)) { _debug("not up to date"); - ret = afs_prepare_page(vnode, page, key, offset, to); + ret = afs_fill_page(vnode, key, pos, len, page); if (ret < 0) { kfree(candidate); _leave(" = %d [prep]", ret); return ret; } + SetPageUptodate(page); } try_again: - index = page->index; spin_lock(&vnode->writeback_lock); /* 
see if this page is already pending a writeback under a suitable key @@ -242,8 +196,8 @@ try_again: subsume_in_current_wb: _debug("subsume"); ASSERTRANGE(wb->first, <=, index, <=, wb->last); - if (index == wb->first && offset < wb->offset_first) - wb->offset_first = offset; + if (index == wb->first && from < wb->offset_first) + wb->offset_first = from; if (index == wb->last && to > wb->to_last) wb->to_last = to; spin_unlock(&vnode->writeback_lock); @@ -289,17 +243,17 @@ flush_conflicting_wb: /* * finalise part of a write to a page */ -int afs_commit_write(struct file *file, struct page *page, - unsigned offset, unsigned to) +int afs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); loff_t i_size, maybe_i_size; - _enter("{%x:%u},{%lx},%u,%u", - vnode->fid.vid, vnode->fid.vnode, page->index, offset, to); + _enter("{%x:%u},{%lx}", + vnode->fid.vid, vnode->fid.vnode, page->index); - maybe_i_size = (loff_t) page->index << PAGE_SHIFT; - maybe_i_size += to; + maybe_i_size = pos + copied; i_size = i_size_read(&vnode->vfs_inode); if (maybe_i_size > i_size) { @@ -310,12 +264,13 @@ int afs_commit_write(struct file *file, struct page *page, spin_unlock(&vnode->writeback_lock); } - SetPageUptodate(page); set_page_dirty(page); if (PageDirty(page)) _debug("dirtied"); + unlock_page(page); + page_cache_release(page); - return 0; + return copied; } /* diff --git a/fs/autofs4/Makefile b/fs/autofs4/Makefile index f2c3b79e94d2..a811c1f7d9ab 100644 --- a/fs/autofs4/Makefile +++ b/fs/autofs4/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_AUTOFS4_FS) += autofs4.o -autofs4-objs := init.o inode.o root.o symlink.o waitq.o expire.o +autofs4-objs := init.o inode.o root.o symlink.o waitq.o expire.o dev-ioctl.o diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 69a2f5c92319..e0f16da00e54 100644 --- a/fs/autofs4/autofs_i.h +++ 
b/fs/autofs4/autofs_i.h @@ -14,6 +14,7 @@ /* Internal header file for autofs */ #include <linux/auto_fs4.h> +#include <linux/auto_dev-ioctl.h> #include <linux/mutex.h> #include <linux/list.h> @@ -21,6 +22,11 @@ #define AUTOFS_IOC_FIRST AUTOFS_IOC_READY #define AUTOFS_IOC_COUNT 32 +#define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION) +#define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11) + +#define AUTOFS_TYPE_TRIGGER (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET) + #include <linux/kernel.h> #include <linux/slab.h> #include <linux/time.h> @@ -35,11 +41,27 @@ /* #define DEBUG */ #ifdef DEBUG -#define DPRINTK(fmt,args...) do { printk(KERN_DEBUG "pid %d: %s: " fmt "\n" , current->pid , __func__ , ##args); } while(0) +#define DPRINTK(fmt, args...) \ +do { \ + printk(KERN_DEBUG "pid %d: %s: " fmt "\n", \ + current->pid, __func__, ##args); \ +} while (0) #else -#define DPRINTK(fmt,args...) do {} while(0) +#define DPRINTK(fmt, args...) do {} while (0) #endif +#define AUTOFS_WARN(fmt, args...) \ +do { \ + printk(KERN_WARNING "pid %d: %s: " fmt "\n", \ + current->pid, __func__, ##args); \ +} while (0) + +#define AUTOFS_ERROR(fmt, args...) \ +do { \ + printk(KERN_ERR "pid %d: %s: " fmt "\n", \ + current->pid, __func__, ##args); \ +} while (0) + /* Unified info structure. This is pointed to by both the dentry and inode structures. Each file in the filesystem has an instance of this structure. 
It holds a reference to the dentry, so dentries are never @@ -61,6 +83,9 @@ struct autofs_info { unsigned long last_used; atomic_t count; + uid_t uid; + gid_t gid; + mode_t mode; size_t size; @@ -92,10 +117,6 @@ struct autofs_wait_queue { #define AUTOFS_SBI_MAGIC 0x6d4a556d -#define AUTOFS_TYPE_INDIRECT 0x0001 -#define AUTOFS_TYPE_DIRECT 0x0002 -#define AUTOFS_TYPE_OFFSET 0x0004 - struct autofs_sb_info { u32 magic; int pipefd; @@ -169,6 +190,17 @@ int autofs4_expire_run(struct super_block *, struct vfsmount *, struct autofs_packet_expire __user *); int autofs4_expire_multi(struct super_block *, struct vfsmount *, struct autofs_sb_info *, int __user *); +struct dentry *autofs4_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, int how); +struct dentry *autofs4_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, int how); + +/* Device node initialization */ + +int autofs_dev_ioctl_init(void); +void autofs_dev_ioctl_exit(void); /* Operations structures */ diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c new file mode 100644 index 000000000000..625abf5422e2 --- /dev/null +++ b/fs/autofs4/dev-ioctl.c @@ -0,0 +1,863 @@ +/* + * Copyright 2008 Red Hat, Inc. All rights reserved. + * Copyright 2008 Ian Kent <raven@themaw.net> + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. 
+ */ + +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/miscdevice.h> +#include <linux/init.h> +#include <linux/wait.h> +#include <linux/namei.h> +#include <linux/fcntl.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/sched.h> +#include <linux/compat.h> +#include <linux/syscalls.h> +#include <linux/smp_lock.h> +#include <linux/magic.h> +#include <linux/dcache.h> +#include <linux/uaccess.h> + +#include "autofs_i.h" + +/* + * This module implements an interface for routing autofs ioctl control + * commands via a miscellaneous device file. + * + * The alternate interface is needed because we need to be able to open + * an ioctl file descriptor on an autofs mount that may be covered by + * another mount. This situation arises when starting automount(8) + * or other user space daemon which uses direct mounts or offset + * mounts (used for autofs lazy mount/umount of nested mount trees), + * which have been left busy at service shutdown. + */ + +#define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl) + +typedef int (*ioctl_fn)(struct file *, struct autofs_sb_info *, + struct autofs_dev_ioctl *); + +static int check_name(const char *name) +{ + if (!strchr(name, '/')) + return -EINVAL; + return 0; +} + +/* + * Check a string doesn't overrun the chunk of + * memory we copied from user land. + */ +static int invalid_str(char *str, void *end) +{ + while ((void *) str <= end) + if (!*str++) + return 0; + return -EINVAL; +} + +/* + * Check that the user compiled against correct version of autofs + * misc device code. + * + * As well as checking the version compatibility this always copies + * the kernel interface version out.
+ */ +static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) +{ + int err = 0; + + if ((AUTOFS_DEV_IOCTL_VERSION_MAJOR != param->ver_major) || + (AUTOFS_DEV_IOCTL_VERSION_MINOR < param->ver_minor)) { + AUTOFS_WARN("ioctl control interface version mismatch: " + "kernel(%u.%u), user(%u.%u), cmd(%d)", + AUTOFS_DEV_IOCTL_VERSION_MAJOR, + AUTOFS_DEV_IOCTL_VERSION_MINOR, + param->ver_major, param->ver_minor, cmd); + err = -EINVAL; + } + + /* Fill in the kernel version. */ + param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; + param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; + + return err; +} + +/* + * Copy parameter control struct, including a possible path allocated + * at the end of the struct. + */ +static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in) +{ + struct autofs_dev_ioctl tmp, *ads; + + if (copy_from_user(&tmp, in, sizeof(tmp))) + return ERR_PTR(-EFAULT); + + if (tmp.size < sizeof(tmp)) + return ERR_PTR(-EINVAL); + + ads = kmalloc(tmp.size, GFP_KERNEL); + if (!ads) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(ads, in, tmp.size)) { + kfree(ads); + return ERR_PTR(-EFAULT); + } + + return ads; +} + +static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) +{ + kfree(param); + return; +} + +/* + * Check sanity of parameter control fields and if a path is present + * check that it has a "/" and is terminated. 
+ */ +static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) +{ + int err = -EINVAL; + + if (check_dev_ioctl_version(cmd, param)) { + AUTOFS_WARN("invalid device control module version " + "supplied for cmd(0x%08x)", cmd); + goto out; + } + + if (param->size > sizeof(*param)) { + err = check_name(param->path); + if (err) { + AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", + cmd); + goto out; + } + + err = invalid_str(param->path, + (void *) ((size_t) param + param->size)); + if (err) { + AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", + cmd); + goto out; + } + } + + err = 0; +out: + return err; +} + +/* + * Get the autofs super block info struct from the file opened on + * the autofs mount point. + */ +static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f) +{ + struct autofs_sb_info *sbi = NULL; + struct inode *inode; + + if (f) { + inode = f->f_path.dentry->d_inode; + sbi = autofs4_sbi(inode->i_sb); + } + return sbi; +} + +/* Return autofs module protocol version */ +static int autofs_dev_ioctl_protover(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + param->arg1 = sbi->version; + return 0; +} + +/* Return autofs module protocol sub version */ +static int autofs_dev_ioctl_protosubver(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + param->arg1 = sbi->sub_version; + return 0; +} + +/* + * Walk down the mount stack looking for an autofs mount that + * has the requested device number (aka. new_encode_dev(sb->s_dev)).
+ */ +static int autofs_dev_ioctl_find_super(struct nameidata *nd, dev_t devno) +{ + struct dentry *dentry; + struct inode *inode; + struct super_block *sb; + dev_t s_dev; + unsigned int err; + + err = -ENOENT; + + /* Lookup the dentry name at the base of our mount point */ + dentry = d_lookup(nd->path.dentry, &nd->last); + if (!dentry) + goto out; + + dput(nd->path.dentry); + nd->path.dentry = dentry; + + /* And follow the mount stack looking for our autofs mount */ + while (follow_down(&nd->path.mnt, &nd->path.dentry)) { + inode = nd->path.dentry->d_inode; + if (!inode) + break; + + sb = inode->i_sb; + s_dev = new_encode_dev(sb->s_dev); + if (devno == s_dev) { + if (sb->s_magic == AUTOFS_SUPER_MAGIC) { + err = 0; + break; + } + } + } +out: + return err; +} + +/* + * Walk down the mount stack looking for an autofs mount that + * has the requested mount type (ie. indirect, direct or offset). + */ +static int autofs_dev_ioctl_find_sbi_type(struct nameidata *nd, unsigned int type) +{ + struct dentry *dentry; + struct autofs_info *ino; + unsigned int err; + + err = -ENOENT; + + /* Lookup the dentry name at the base of our mount point */ + dentry = d_lookup(nd->path.dentry, &nd->last); + if (!dentry) + goto out; + + dput(nd->path.dentry); + nd->path.dentry = dentry; + + /* And follow the mount stack looking for our autofs mount */ + while (follow_down(&nd->path.mnt, &nd->path.dentry)) { + ino = autofs4_dentry_ino(nd->path.dentry); + if (ino && ino->sbi->type & type) { + err = 0; + break; + } + } +out: + return err; +} + +static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) +{ + struct files_struct *files = current->files; + struct fdtable *fdt; + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + BUG_ON(fdt->fd[fd] != NULL); + rcu_assign_pointer(fdt->fd[fd], file); + FD_SET(fd, fdt->close_on_exec); + spin_unlock(&files->file_lock); +} + + +/* + * Open a file descriptor on the autofs mount point corresponding + * to the given path and 
device number (aka. new_encode_dev(sb->s_dev)). + */ +static int autofs_dev_ioctl_open_mountpoint(const char *path, dev_t devid) +{ + struct file *filp; + struct nameidata nd; + int err, fd; + + fd = get_unused_fd(); + if (likely(fd >= 0)) { + /* Get nameidata of the parent directory */ + err = path_lookup(path, LOOKUP_PARENT, &nd); + if (err) + goto out; + + /* + * Search down, within the parent, looking for an + * autofs super block that has the device number + * corresponding to the autofs fs we want to open. + */ + err = autofs_dev_ioctl_find_super(&nd, devid); + if (err) { + path_put(&nd.path); + goto out; + } + + filp = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY); + if (IS_ERR(filp)) { + err = PTR_ERR(filp); + goto out; + } + + autofs_dev_ioctl_fd_install(fd, filp); + } + + return fd; + +out: + put_unused_fd(fd); + return err; +} + +/* Open a file descriptor on an autofs mount point */ +static int autofs_dev_ioctl_openmount(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + const char *path; + dev_t devid; + int err, fd; + + /* param->path has already been checked */ + if (!param->arg1) + return -EINVAL; + + param->ioctlfd = -1; + + path = param->path; + devid = param->arg1; + + err = 0; + fd = autofs_dev_ioctl_open_mountpoint(path, devid); + if (unlikely(fd < 0)) { + err = fd; + goto out; + } + + param->ioctlfd = fd; +out: + return err; +} + +/* Close file descriptor allocated above (user can also use close(2)). */ +static int autofs_dev_ioctl_closemount(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + return sys_close(param->ioctlfd); +} + +/* + * Send "ready" status for an existing wait (either a mount or an expire + * request). 
+ */ +static int autofs_dev_ioctl_ready(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + autofs_wqt_t token; + + token = (autofs_wqt_t) param->arg1; + return autofs4_wait_release(sbi, token, 0); +} + +/* + * Send "fail" status for an existing wait (either a mount or an expire + * request). + */ +static int autofs_dev_ioctl_fail(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + autofs_wqt_t token; + int status; + + token = (autofs_wqt_t) param->arg1; + status = param->arg2 ? param->arg2 : -ENOENT; + return autofs4_wait_release(sbi, token, status); +} + +/* + * Set the pipe fd for kernel communication to the daemon. + * + * Normally this is set at mount using an option but if we + * are reconnecting to a busy mount then we need to use this + * to tell the autofs mount about the new kernel pipe fd. In + * order to protect mounts against incorrectly setting the + * pipefd we also require that the autofs mount be catatonic. + * + * This also sets the process group id used to identify the + * controlling process (eg. the owning automount(8) daemon). + */ +static int autofs_dev_ioctl_setpipefd(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + int pipefd; + int err = 0; + + if (param->arg1 == -1) + return -EINVAL; + + pipefd = param->arg1; + + mutex_lock(&sbi->wq_mutex); + if (!sbi->catatonic) { + mutex_unlock(&sbi->wq_mutex); + return -EBUSY; + } else { + struct file *pipe = fget(pipefd); + if (!pipe->f_op || !pipe->f_op->write) { + err = -EPIPE; + fput(pipe); + goto out; + } + sbi->oz_pgrp = task_pgrp_nr(current); + sbi->pipefd = pipefd; + sbi->pipe = pipe; + sbi->catatonic = 0; + } +out: + mutex_unlock(&sbi->wq_mutex); + return err; +} + +/* + * Make the autofs mount point catatonic, no longer responsive to + * mount requests. Also closes the kernel pipe file descriptor. 
+ */ +static int autofs_dev_ioctl_catatonic(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + autofs4_catatonic_mode(sbi); + return 0; +} + +/* Set the autofs mount timeout */ +static int autofs_dev_ioctl_timeout(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + unsigned long timeout; + + timeout = param->arg1; + param->arg1 = sbi->exp_timeout / HZ; + sbi->exp_timeout = timeout * HZ; + return 0; +} + +/* + * Return the uid and gid of the last request for the mount + * + * When reconstructing an autofs mount tree with active mounts + * we need to re-connect to mounts that may have used the original + * process uid and gid (or string variations of them) for mount + * lookups within the map entry. + */ +static int autofs_dev_ioctl_requester(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + struct autofs_info *ino; + struct nameidata nd; + const char *path; + dev_t devid; + int err = -ENOENT; + + if (param->size <= sizeof(*param)) { + err = -EINVAL; + goto out; + } + + path = param->path; + devid = sbi->sb->s_dev; + + param->arg1 = param->arg2 = -1; + + /* Get nameidata of the parent directory */ + err = path_lookup(path, LOOKUP_PARENT, &nd); + if (err) + goto out; + + err = autofs_dev_ioctl_find_super(&nd, devid); + if (err) + goto out_release; + + ino = autofs4_dentry_ino(nd.path.dentry); + if (ino) { + err = 0; + autofs4_expire_wait(nd.path.dentry); + spin_lock(&sbi->fs_lock); + param->arg1 = ino->uid; + param->arg2 = ino->gid; + spin_unlock(&sbi->fs_lock); + } + +out_release: + path_put(&nd.path); +out: + return err; +} + +/* + * Call repeatedly until it returns -EAGAIN, meaning there's nothing + * more that can be done. 
+ */ +static int autofs_dev_ioctl_expire(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + struct dentry *dentry; + struct vfsmount *mnt; + int err = -EAGAIN; + int how; + + how = param->arg1; + mnt = fp->f_path.mnt; + + if (sbi->type & AUTOFS_TYPE_TRIGGER) + dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how); + else + dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how); + + if (dentry) { + struct autofs_info *ino = autofs4_dentry_ino(dentry); + + /* + * This is synchronous because it makes the daemon a + * little easier + */ + err = autofs4_wait(sbi, dentry, NFY_EXPIRE); + + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_MOUNTPOINT) { + ino->flags &= ~AUTOFS_INF_MOUNTPOINT; + sbi->sb->s_root->d_mounted++; + } + ino->flags &= ~AUTOFS_INF_EXPIRING; + complete_all(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); + dput(dentry); + } + + return err; +} + +/* Check if autofs mount point is in use */ +static int autofs_dev_ioctl_askumount(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + param->arg1 = 0; + if (may_umount(fp->f_path.mnt)) + param->arg1 = 1; + return 0; +} + +/* + * Check if the given path is a mountpoint. + * + * If we are supplied with the file descriptor of an autofs + * mount we're looking for a specific mount. In this case + * the path is considered a mountpoint if it is itself a + * mountpoint or contains a mount, such as a multi-mount + * without a root mount. In this case we return 1 if the + * path is a mount point and the super magic of the covering + * mount if there is one or 0 if it isn't a mountpoint. + * + * If we aren't supplied with a file descriptor then we + * lookup the nameidata of the path and check if it is the + * root of a mount. If a type is given we are looking for + * a particular autofs mount and if we don't find a match + * we return fail. 
If the located nameidata path is the + * root of a mount we return 1 along with the super magic + * of the mount or 0 otherwise. + * + * In both cases the device number (as returned by + * new_encode_dev()) is also returned. + */ +static int autofs_dev_ioctl_ismountpoint(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + struct nameidata nd; + const char *path; + unsigned int type; + int err = -ENOENT; + + if (param->size <= sizeof(*param)) { + err = -EINVAL; + goto out; + } + + path = param->path; + type = param->arg1; + + param->arg1 = 0; + param->arg2 = 0; + + if (!fp || param->ioctlfd == -1) { + if (type == AUTOFS_TYPE_ANY) { + struct super_block *sb; + + err = path_lookup(path, LOOKUP_FOLLOW, &nd); + if (err) + goto out; + + sb = nd.path.dentry->d_sb; + param->arg1 = new_encode_dev(sb->s_dev); + } else { + struct autofs_info *ino; + + err = path_lookup(path, LOOKUP_PARENT, &nd); + if (err) + goto out; + + err = autofs_dev_ioctl_find_sbi_type(&nd, type); + if (err) + goto out_release; + + ino = autofs4_dentry_ino(nd.path.dentry); + param->arg1 = autofs4_get_dev(ino->sbi); + } + + err = 0; + if (nd.path.dentry->d_inode && + nd.path.mnt->mnt_root == nd.path.dentry) { + err = 1; + param->arg2 = nd.path.dentry->d_inode->i_sb->s_magic; + } + } else { + dev_t devid = new_encode_dev(sbi->sb->s_dev); + + err = path_lookup(path, LOOKUP_PARENT, &nd); + if (err) + goto out; + + err = autofs_dev_ioctl_find_super(&nd, devid); + if (err) + goto out_release; + + param->arg1 = autofs4_get_dev(sbi); + + err = have_submounts(nd.path.dentry); + + if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) { + if (follow_down(&nd.path.mnt, &nd.path.dentry)) { + struct inode *inode = nd.path.dentry->d_inode; + param->arg2 = inode->i_sb->s_magic; + } + } + } + +out_release: + path_put(&nd.path); +out: + return err; +} + +/* + * Our range of ioctl numbers isn't 0 based so we need to shift + * the array index by _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST) for
the table + * lookup. + */ +#define cmd_idx(cmd) (cmd - _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST)) + +static ioctl_fn lookup_dev_ioctl(unsigned int cmd) +{ + static struct { + int cmd; + ioctl_fn fn; + } _ioctls[] = { + {cmd_idx(AUTOFS_DEV_IOCTL_VERSION_CMD), NULL}, + {cmd_idx(AUTOFS_DEV_IOCTL_PROTOVER_CMD), + autofs_dev_ioctl_protover}, + {cmd_idx(AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD), + autofs_dev_ioctl_protosubver}, + {cmd_idx(AUTOFS_DEV_IOCTL_OPENMOUNT_CMD), + autofs_dev_ioctl_openmount}, + {cmd_idx(AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD), + autofs_dev_ioctl_closemount}, + {cmd_idx(AUTOFS_DEV_IOCTL_READY_CMD), + autofs_dev_ioctl_ready}, + {cmd_idx(AUTOFS_DEV_IOCTL_FAIL_CMD), + autofs_dev_ioctl_fail}, + {cmd_idx(AUTOFS_DEV_IOCTL_SETPIPEFD_CMD), + autofs_dev_ioctl_setpipefd}, + {cmd_idx(AUTOFS_DEV_IOCTL_CATATONIC_CMD), + autofs_dev_ioctl_catatonic}, + {cmd_idx(AUTOFS_DEV_IOCTL_TIMEOUT_CMD), + autofs_dev_ioctl_timeout}, + {cmd_idx(AUTOFS_DEV_IOCTL_REQUESTER_CMD), + autofs_dev_ioctl_requester}, + {cmd_idx(AUTOFS_DEV_IOCTL_EXPIRE_CMD), + autofs_dev_ioctl_expire}, + {cmd_idx(AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD), + autofs_dev_ioctl_askumount}, + {cmd_idx(AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD), + autofs_dev_ioctl_ismountpoint} + }; + unsigned int idx = cmd_idx(cmd); + + return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx].fn; +} + +/* ioctl dispatcher */ +static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user) +{ + struct autofs_dev_ioctl *param; + struct file *fp; + struct autofs_sb_info *sbi; + unsigned int cmd_first, cmd; + ioctl_fn fn = NULL; + int err = 0; + + /* only root can play with this */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST); + cmd = _IOC_NR(command); + + if (_IOC_TYPE(command) != _IOC_TYPE(AUTOFS_DEV_IOCTL_IOC_FIRST) || + cmd - cmd_first >= AUTOFS_DEV_IOCTL_IOC_COUNT) { + return -ENOTTY; + } + + /* Copy the parameters into kernel space. 
*/ + param = copy_dev_ioctl(user); + if (IS_ERR(param)) + return PTR_ERR(param); + + err = validate_dev_ioctl(command, param); + if (err) + goto out; + + /* The validate routine above always sets the version */ + if (cmd == AUTOFS_DEV_IOCTL_VERSION_CMD) + goto done; + + fn = lookup_dev_ioctl(cmd); + if (!fn) { + AUTOFS_WARN("unknown command 0x%08x", command); + return -ENOTTY; + } + + fp = NULL; + sbi = NULL; + + /* + * For obvious reasons the openmount can't have a file + * descriptor yet. We don't take a reference to the + * file during close to allow for immediate release. + */ + if (cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD && + cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) { + fp = fget(param->ioctlfd); + if (!fp) { + if (cmd == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) + goto cont; + err = -EBADF; + goto out; + } + + if (!fp->f_op) { + err = -ENOTTY; + fput(fp); + goto out; + } + + sbi = autofs_dev_ioctl_sbi(fp); + if (!sbi || sbi->magic != AUTOFS_SBI_MAGIC) { + err = -EINVAL; + fput(fp); + goto out; + } + + /* + * Admin needs to be able to set the mount catatonic in + * order to be able to perform the re-open. 
+ */ + if (!autofs4_oz_mode(sbi) && + cmd != AUTOFS_DEV_IOCTL_CATATONIC_CMD) { + err = -EACCES; + fput(fp); + goto out; + } + } +cont: + err = fn(fp, sbi, param); + + if (fp) + fput(fp); +done: + if (err >= 0 && copy_to_user(user, param, AUTOFS_DEV_IOCTL_SIZE)) + err = -EFAULT; +out: + free_dev_ioctl(param); + return err; +} + +static long autofs_dev_ioctl(struct file *file, uint command, ulong u) +{ + int err; + err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u); + return (long) err; +} + +#ifdef CONFIG_COMPAT +static long autofs_dev_ioctl_compat(struct file *file, uint command, ulong u) +{ + return (long) autofs_dev_ioctl(file, command, (ulong) compat_ptr(u)); +} +#else +#define autofs_dev_ioctl_compat NULL +#endif + +static const struct file_operations _dev_ioctl_fops = { + .unlocked_ioctl = autofs_dev_ioctl, + .compat_ioctl = autofs_dev_ioctl_compat, + .owner = THIS_MODULE, +}; + +static struct miscdevice _autofs_dev_ioctl_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = AUTOFS_DEVICE_NAME, + .fops = &_dev_ioctl_fops +}; + +/* Register/deregister misc character device */ +int autofs_dev_ioctl_init(void) +{ + int r; + + r = misc_register(&_autofs_dev_ioctl_misc); + if (r) { + AUTOFS_ERROR("misc_register failed for control device"); + return r; + } + + return 0; +} + +void autofs_dev_ioctl_exit(void) +{ + misc_deregister(&_autofs_dev_ioctl_misc); + return; +} + diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index cdabb796ff01..cde2f8e8935a 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -244,10 +244,10 @@ cont: } /* Check if we can expire a direct mount (possibly a tree) */ -static struct dentry *autofs4_expire_direct(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - int how) +struct dentry *autofs4_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) { unsigned long timeout; struct dentry *root = dget(sb->s_root); @@ -283,10 +283,10 @@ 
static struct dentry *autofs4_expire_direct(struct super_block *sb, * - it is unused by any user process * - it has been unused for exp_timeout time */ -static struct dentry *autofs4_expire_indirect(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - int how) +struct dentry *autofs4_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) { unsigned long timeout; struct dentry *root = sb->s_root; @@ -479,7 +479,7 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, if (arg && get_user(do_now, arg)) return -EFAULT; - if (sbi->type & AUTOFS_TYPE_DIRECT) + if (sbi->type & AUTOFS_TYPE_TRIGGER) dentry = autofs4_expire_direct(sb, mnt, sbi, do_now); else dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now); diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c index 723a1c5e361b..9722e4bd8957 100644 --- a/fs/autofs4/init.c +++ b/fs/autofs4/init.c @@ -29,11 +29,20 @@ static struct file_system_type autofs_fs_type = { static int __init init_autofs4_fs(void) { - return register_filesystem(&autofs_fs_type); + int err; + + err = register_filesystem(&autofs_fs_type); + if (err) + return err; + + autofs_dev_ioctl_init(); + + return err; } static void __exit exit_autofs4_fs(void) { + autofs_dev_ioctl_exit(); unregister_filesystem(&autofs_fs_type); } diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 45d55819203d..c7e65bb30ba0 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -53,6 +53,8 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino, atomic_set(&ino->count, 0); } + ino->uid = 0; + ino->gid = 0; ino->mode = mode; ino->last_used = jiffies; @@ -288,7 +290,7 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, *type = AUTOFS_TYPE_DIRECT; break; case Opt_offset: - *type = AUTOFS_TYPE_DIRECT | AUTOFS_TYPE_OFFSET; + *type = AUTOFS_TYPE_OFFSET; break; default: return 1; @@ -336,7 +338,7 @@ int autofs4_fill_super(struct super_block 
*s, void *data, int silent) sbi->sb = s; sbi->version = 0; sbi->sub_version = 0; - sbi->type = 0; + sbi->type = AUTOFS_TYPE_INDIRECT; sbi->min_proto = 0; sbi->max_proto = 0; mutex_init(&sbi->wq_mutex); @@ -378,7 +380,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) } root_inode->i_fop = &autofs4_root_operations; - root_inode->i_op = sbi->type & AUTOFS_TYPE_DIRECT ? + root_inode->i_op = sbi->type & AUTOFS_TYPE_TRIGGER ? &autofs4_direct_root_inode_operations : &autofs4_indirect_root_inode_operations; diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 35216d18d8b5..4b67c2a2d77c 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -337,7 +337,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, * is very similar for indirect mounts except only dentrys * in the root of the autofs file system may be negative. */ - if (sbi->type & (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET)) + if (sbi->type & AUTOFS_TYPE_TRIGGER) return -ENOENT; else if (!IS_ROOT(dentry->d_parent)) return -ENOENT; @@ -348,7 +348,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, return -ENOMEM; /* If this is a direct mount request create a dummy name */ - if (IS_ROOT(dentry) && (sbi->type & AUTOFS_TYPE_DIRECT)) + if (IS_ROOT(dentry) && sbi->type & AUTOFS_TYPE_TRIGGER) qstr.len = sprintf(name, "%p", dentry); else { qstr.len = autofs4_getpath(sbi, dentry, &name); @@ -406,11 +406,11 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, type = autofs_ptype_expire_multi; } else { if (notify == NFY_MOUNT) - type = (sbi->type & AUTOFS_TYPE_DIRECT) ? + type = (sbi->type & AUTOFS_TYPE_TRIGGER) ? autofs_ptype_missing_direct : autofs_ptype_missing_indirect; else - type = (sbi->type & AUTOFS_TYPE_DIRECT) ? + type = (sbi->type & AUTOFS_TYPE_TRIGGER) ? 
autofs_ptype_expire_direct : autofs_ptype_expire_indirect; } @@ -457,6 +457,40 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, status = wq->status; + /* + * For direct and offset mounts we need to track the requester's + * uid and gid in the dentry info struct. This is so it can be + * supplied, on request, by the misc device ioctl interface. + * This is needed during daemon resatart when reconnecting + * to existing, active, autofs mounts. The uid and gid (and + * related string values) may be used for macro substitution + * in autofs mount maps. + */ + if (!status) { + struct autofs_info *ino; + struct dentry *de = NULL; + + /* direct mount or browsable map */ + ino = autofs4_dentry_ino(dentry); + if (!ino) { + /* If not lookup actual dentry used */ + de = d_lookup(dentry->d_parent, &dentry->d_name); + if (de) + ino = autofs4_dentry_ino(de); + } + + /* Set mount requester */ + if (ino) { + spin_lock(&sbi->fs_lock); + ino->uid = wq->uid; + ino->gid = wq->gid; + spin_unlock(&sbi->fs_lock); + } + + if (de) + dput(de); + } + /* Are we the last process to need status? 
*/ mutex_lock(&sbi->wq_mutex); if (!--wq->wait_ctr) diff --git a/fs/befs/befs_fs_types.h b/fs/befs/befs_fs_types.h index e2595c2c403a..7893eaa1e58c 100644 --- a/fs/befs/befs_fs_types.h +++ b/fs/befs/befs_fs_types.h @@ -55,8 +55,12 @@ enum super_flags { }; #define BEFS_BYTEORDER_NATIVE 0x42494745 +#define BEFS_BYTEORDER_NATIVE_LE (__force fs32)cpu_to_le32(BEFS_BYTEORDER_NATIVE) +#define BEFS_BYTEORDER_NATIVE_BE (__force fs32)cpu_to_be32(BEFS_BYTEORDER_NATIVE) #define BEFS_SUPER_MAGIC BEFS_SUPER_MAGIC1 +#define BEFS_SUPER_MAGIC1_LE (__force fs32)cpu_to_le32(BEFS_SUPER_MAGIC1) +#define BEFS_SUPER_MAGIC1_BE (__force fs32)cpu_to_be32(BEFS_SUPER_MAGIC1) /* * Flags of inode diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 9286b2af893a..b6dfee37c7b7 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -809,8 +809,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent) /* account for offset of super block on x86 */ disk_sb = (befs_super_block *) bh->b_data; - if ((le32_to_cpu(disk_sb->magic1) == BEFS_SUPER_MAGIC1) || - (be32_to_cpu(disk_sb->magic1) == BEFS_SUPER_MAGIC1)) { + if ((disk_sb->magic1 == BEFS_SUPER_MAGIC1_LE) || + (disk_sb->magic1 == BEFS_SUPER_MAGIC1_BE)) { befs_debug(sb, "Using PPC superblock location"); } else { befs_debug(sb, "Using x86 superblock location"); diff --git a/fs/befs/super.c b/fs/befs/super.c index 8c3401ff6d6a..41f2b4d0093e 100644 --- a/fs/befs/super.c +++ b/fs/befs/super.c @@ -26,10 +26,10 @@ befs_load_sb(struct super_block *sb, befs_super_block * disk_sb) befs_sb_info *befs_sb = BEFS_SB(sb); /* Check the byte order of the filesystem */ - if (le32_to_cpu(disk_sb->fs_byte_order) == BEFS_BYTEORDER_NATIVE) + if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_LE) befs_sb->byte_order = BEFS_BYTESEX_LE; - else if (be32_to_cpu(disk_sb->fs_byte_order) == BEFS_BYTEORDER_NATIVE) - befs_sb->byte_order = BEFS_BYTESEX_BE; + else if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_BE) + befs_sb->byte_order = BEFS_BYTESEX_BE; 
befs_sb->magic1 = fs32_to_cpu(sb, disk_sb->magic1); befs_sb->magic2 = fs32_to_cpu(sb, disk_sb->magic2); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 655ed8d30a86..e2159063198a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -683,7 +683,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) * switch really is going to happen - do this in * flush_thread(). - akpm */ - SET_PERSONALITY(loc->elf_ex, 0); + SET_PERSONALITY(loc->elf_ex); interpreter = open_exec(elf_interpreter); retval = PTR_ERR(interpreter); @@ -734,7 +734,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) goto out_free_dentry; } else { /* Executables without an interpreter also need a personality */ - SET_PERSONALITY(loc->elf_ex, 0); + SET_PERSONALITY(loc->elf_ex); } /* Flush all traces of the currently running executable */ @@ -748,7 +748,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) /* Do this immediately, since STACK_TOP as used in setup_arg_pages may depend on the personality. */ - SET_PERSONALITY(loc->elf_ex, 0); + SET_PERSONALITY(loc->elf_ex); if (elf_read_implies_exec(loc->elf_ex, executable_stack)) current->personality |= READ_IMPLIES_EXEC; @@ -1156,16 +1156,24 @@ static int dump_seek(struct file *file, loff_t off) static unsigned long vma_dump_size(struct vm_area_struct *vma, unsigned long mm_flags) { +#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) + /* The vma can be set up to tell us the answer directly. 
*/ if (vma->vm_flags & VM_ALWAYSDUMP) goto whole; + /* Hugetlb memory check */ + if (vma->vm_flags & VM_HUGETLB) { + if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) + goto whole; + if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) + goto whole; + } + /* Do not dump I/O mapped devices or special mappings */ if (vma->vm_flags & (VM_IO | VM_RESERVED)) return 0; -#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) - /* By default, dump shared memory if mapped from an anonymous file. */ if (vma->vm_flags & VM_SHARED) { if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0 ? diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 80c1f952ef78..0e8367c54624 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -25,6 +25,7 @@ #include <linux/fcntl.h> #include <linux/slab.h> #include <linux/pagemap.h> +#include <linux/security.h> #include <linux/highmem.h> #include <linux/highuid.h> #include <linux/personality.h> @@ -455,8 +456,19 @@ error_kill: } /*****************************************************************************/ + +#ifndef ELF_BASE_PLATFORM /* - * present useful information to the program + * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture. + * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value + * will be copied to the user stack in the same manner as AT_PLATFORM. 
+ */ +#define ELF_BASE_PLATFORM NULL +#endif + +/* + * present useful information to the program by shovelling it onto the new + * process's stack */ static int create_elf_fdpic_tables(struct linux_binprm *bprm, struct mm_struct *mm, @@ -466,15 +478,19 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, unsigned long sp, csp, nitems; elf_caddr_t __user *argv, *envp; size_t platform_len = 0, len; - char *k_platform; - char __user *u_platform, *p; + char *k_platform, *k_base_platform; + char __user *u_platform, *u_base_platform, *p; long hwcap; int loop; int nr; /* reset for each csp adjustment */ - /* we're going to shovel a whole load of stuff onto the stack */ #ifdef CONFIG_MMU - sp = bprm->p; + /* In some cases (e.g. Hyper-Threading), we want to avoid L1 evictions + * by the processes running on the same package. One thing we can do is + * to shuffle the initial stack for them, so we give the architecture + * an opportunity to do so here. + */ + sp = arch_align_stack(bprm->p); #else sp = mm->start_stack; @@ -483,11 +499,14 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, return -EFAULT; #endif - /* get hold of platform and hardware capabilities masks for the machine - * we are running on. In some cases (Sparc), this info is impossible - * to get, in others (i386) it is merely difficult. - */ hwcap = ELF_HWCAP; + + /* + * If this architecture has a platform capability string, copy it + * to userspace. In some cases (Sparc), this info is impossible + * for userspace to get any other way, in others (i386) it is + * merely difficult. + */ k_platform = ELF_PLATFORM; u_platform = NULL; @@ -499,19 +518,20 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, return -EFAULT; } -#if defined(__i386__) && defined(CONFIG_SMP) - /* in some cases (e.g. Hyper-Threading), we want to avoid L1 evictions - * by the processes running on the same package. One thing we can do is - * to shuffle the initial stack for them. 
- * - * the conditionals here are unneeded, but kept in to make the code - * behaviour the same as pre change unless we have hyperthreaded - * processors. This keeps Mr Marcelo Person happier but should be - * removed for 2.5 + /* + * If this architecture has a "base" platform capability + * string, copy it to userspace. */ - if (smp_num_siblings > 1) - sp = sp - ((current->pid % 64) << 7); -#endif + k_base_platform = ELF_BASE_PLATFORM; + u_base_platform = NULL; + + if (k_base_platform) { + platform_len = strlen(k_base_platform) + 1; + sp -= platform_len; + u_base_platform = (char __user *) sp; + if (__copy_to_user(u_base_platform, k_base_platform, platform_len) != 0) + return -EFAULT; + } sp &= ~7UL; @@ -541,9 +561,13 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, } /* force 16 byte _final_ alignment here for generality */ -#define DLINFO_ITEMS 13 +#define DLINFO_ITEMS 15 + + nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0) + + (k_base_platform ? 1 : 0) + AT_VECTOR_SIZE_ARCH; - nitems = 1 + DLINFO_ITEMS + (k_platform ? 
1 : 0) + AT_VECTOR_SIZE_ARCH; + if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) + nitems++; csp = sp; sp -= nitems * 2 * sizeof(unsigned long); @@ -575,6 +599,19 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, (elf_addr_t) (unsigned long) u_platform); } + if (k_base_platform) { + nr = 0; + csp -= 2 * sizeof(unsigned long); + NEW_AUX_ENT(AT_BASE_PLATFORM, + (elf_addr_t) (unsigned long) u_base_platform); + } + + if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) { + nr = 0; + csp -= 2 * sizeof(unsigned long); + NEW_AUX_ENT(AT_EXECFD, bprm->interp_data); + } + nr = 0; csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long); NEW_AUX_ENT(AT_HWCAP, hwcap); @@ -590,6 +627,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, NEW_AUX_ENT(AT_EUID, (elf_addr_t) current->euid); NEW_AUX_ENT(AT_GID, (elf_addr_t) current->gid); NEW_AUX_ENT(AT_EGID, (elf_addr_t) current->egid); + NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); + NEW_AUX_ENT(AT_EXECFN, bprm->exec); #ifdef ARCH_DLINFO nr = 0; diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index f9c88d0c8ced..32fb00b52cd0 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -43,7 +43,7 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs) return -ENOEXEC; } - bprm->sh_bang = 1; /* Well, the bang-shell is implicit... */ + bprm->recursion_depth++; /* Well, the bang-shell is implicit... 
*/ allow_write_access(bprm->file); fput(bprm->file); bprm->file = NULL; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index dfc0197905ca..ccb781a6a804 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -229,13 +229,13 @@ static int decompress_exec( ret = 10; if (buf[3] & EXTRA_FIELD) { ret += 2 + buf[10] + (buf[11] << 8); - if (unlikely(LBUFSIZE == ret)) { + if (unlikely(LBUFSIZE <= ret)) { DBG_FLT("binfmt_flat: buffer overflow (EXTRA)?\n"); goto out_free_buf; } } if (buf[3] & ORIG_NAME) { - for (; ret < LBUFSIZE && (buf[ret] != 0); ret++) + while (ret < LBUFSIZE && buf[ret++] != 0) ; if (unlikely(LBUFSIZE == ret)) { DBG_FLT("binfmt_flat: buffer overflow (ORIG_NAME)?\n"); @@ -243,7 +243,7 @@ static int decompress_exec( } } if (buf[3] & COMMENT) { - for (; ret < LBUFSIZE && (buf[ret] != 0); ret++) + while (ret < LBUFSIZE && buf[ret++] != 0) ; if (unlikely(LBUFSIZE == ret)) { DBG_FLT("binfmt_flat: buffer overflow (COMMENT)?\n"); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 8d7e88e02e0f..f2744ab4e5b3 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -117,7 +117,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) goto _ret; retval = -ENOEXEC; - if (bprm->misc_bang) + if (bprm->recursion_depth > BINPRM_MAX_RECURSION) goto _ret; /* to keep locking time low, we copy the interpreter string */ @@ -197,7 +197,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (retval < 0) goto _error; - bprm->misc_bang = 1; + bprm->recursion_depth++; retval = search_binary_handler (bprm, regs); if (retval < 0) diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 9e3963f7ebf1..08343505e184 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -22,14 +22,15 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) char interp[BINPRM_BUF_SIZE]; int retval; - if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!') || (bprm->sh_bang)) + if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!') 
|| + (bprm->recursion_depth > BINPRM_MAX_RECURSION)) return -ENOEXEC; /* * This section does the #! interpretation. * Sorta complicated, but hopefully it will work. -TYT */ - bprm->sh_bang = 1; + bprm->recursion_depth++; allow_write_access(bprm->file); fput(bprm->file); bprm->file = NULL; diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index 68be580ba289..74e587a52796 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -306,3 +306,5 @@ static void __exit exit_som_binfmt(void) core_initcall(init_som_binfmt); module_exit(exit_som_binfmt); + +MODULE_LICENSE("GPL"); diff --git a/fs/block_dev.c b/fs/block_dev.c index d84f0469a016..218408eed1bb 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1262,7 +1262,7 @@ EXPORT_SYMBOL(ioctl_by_bdev); /** * lookup_bdev - lookup a struct block_device by name - * @pathname: special file representing the block device + * @path: special file representing the block device * * Get a reference to the blockdevice at @pathname in the current * namespace if possible and return it. 
Return ERR_PTR(error) diff --git a/fs/buffer.c b/fs/buffer.c index ac78d4c19b3b..6569fda5cfed 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -76,8 +76,7 @@ EXPORT_SYMBOL(__lock_buffer); void unlock_buffer(struct buffer_head *bh) { - smp_mb__before_clear_bit(); - clear_buffer_locked(bh); + clear_bit_unlock(BH_Lock, &bh->b_state); smp_mb__after_clear_bit(); wake_up_bit(&bh->b_state, BH_Lock); } diff --git a/fs/char_dev.c b/fs/char_dev.c index 3cb7cda3d780..262fa10e213d 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -22,9 +22,6 @@ #include <linux/mutex.h> #include <linux/backing-dev.h> -#ifdef CONFIG_KMOD -#include <linux/kmod.h> -#endif #include "internal.h" /* diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig new file mode 100644 index 000000000000..341a98965bd0 --- /dev/null +++ b/fs/cifs/Kconfig @@ -0,0 +1,142 @@ +config CIFS + tristate "CIFS support (advanced network filesystem, SMBFS successor)" + depends on INET + select NLS + help + This is the client VFS module for the Common Internet File System + (CIFS) protocol which is the successor to the Server Message Block + (SMB) protocol, the native file sharing mechanism for most early + PC operating systems. The CIFS protocol is fully supported by + file servers such as Windows 2000 (including Windows 2003, NT 4 + and Windows XP) as well by Samba (which provides excellent CIFS + server support for Linux and many other operating systems). Limited + support for OS/2 and Windows ME and similar servers is provided as + well. + + The cifs module provides an advanced network file system + client for mounting to CIFS compliant servers. It includes + support for DFS (hierarchical name space), secure per-user + session establishment via Kerberos or NTLM or NTLMv2, + safe distributed caching (oplock), optional packet + signing, Unicode and other internationalization improvements. + If you need to mount to Samba or Windows from this machine, say Y. 
+ +config CIFS_STATS + bool "CIFS statistics" + depends on CIFS + help + Enabling this option will cause statistics for each server share + mounted by the cifs client to be displayed in /proc/fs/cifs/Stats + +config CIFS_STATS2 + bool "Extended statistics" + depends on CIFS_STATS + help + Enabling this option will allow more detailed statistics on SMB + request timing to be displayed in /proc/fs/cifs/DebugData and also + allow optional logging of slow responses to dmesg (depending on the + value of /proc/fs/cifs/cifsFYI, see fs/cifs/README for more details). + These additional statistics may have a minor effect on performance + and memory utilization. + + Unless you are a developer or are doing network performance analysis + or tuning, say N. + +config CIFS_WEAK_PW_HASH + bool "Support legacy servers which use weaker LANMAN security" + depends on CIFS + help + Modern CIFS servers including Samba and most Windows versions + (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos) + security mechanisms. These hash the password more securely + than the mechanisms used in the older LANMAN version of the + SMB protocol but LANMAN based authentication is needed to + establish sessions with some old SMB servers. + + Enabling this option allows the cifs module to mount to older + LANMAN based servers such as OS/2 and Windows 95, but such + mounts may be less secure than mounts using NTLM or more recent + security mechanisms if you are on a public network. Unless you + have a need to access old SMB servers (and are on a private + network) you probably want to say N. Even if this support + is enabled in the kernel build, LANMAN authentication will not be + used automatically. At runtime LANMAN mounts are disabled but + can be set to required (or optional) either in + /proc/fs/cifs (see fs/cifs/README for more detail) or via an + option on the mount command. This support is disabled by + default in order to reduce the possibility of a downgrade + attack. 
+ + If unsure, say N. + +config CIFS_UPCALL + bool "Kerberos/SPNEGO advanced session setup" + depends on CIFS && KEYS + help + Enables an upcall mechanism for CIFS which accesses + userspace helper utilities to provide SPNEGO packaged (RFC 4178) + Kerberos tickets which are needed to mount to certain secure servers + (for which more secure Kerberos authentication is required). If + unsure, say N. + +config CIFS_XATTR + bool "CIFS extended attributes" + depends on CIFS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). CIFS maps the name of + extended attributes beginning with the user namespace prefix + to SMB/CIFS EAs. EAs are stored on Windows servers without the + user namespace prefix, but their names are seen by Linux cifs clients + prefaced by the user namespace prefix. The system namespace + (used by some filesystems to store ACLs) is not supported at + this time. + + If unsure, say N. + +config CIFS_POSIX + bool "CIFS POSIX Extensions" + depends on CIFS_XATTR + help + Enabling this option will cause the cifs client to attempt to + negotiate a newer dialect with servers, such as Samba 3.0.5 + or later, that optionally can handle more POSIX like (rather + than Windows like) file behavior. It also enables + support for POSIX ACLs (getfacl and setfacl) to servers + (such as Samba 3.10 and later) which can negotiate + CIFS POSIX ACL support. If unsure, say N. + +config CIFS_DEBUG2 + bool "Enable additional CIFS debugging routines" + depends on CIFS + help + Enabling this option adds a few more debugging routines + to the cifs code which slightly increases the size of + the cifs module and can cause additional logging of debug + messages in some error paths, slowing performance. This + option can be turned off unless you are debugging + cifs problems. If unsure, say N. 
+ +config CIFS_EXPERIMENTAL + bool "CIFS Experimental Features (EXPERIMENTAL)" + depends on CIFS && EXPERIMENTAL + help + Enables cifs features under testing. These features are + experimental and currently include DFS support and directory + change notification ie fcntl(F_DNOTIFY), as well as the upcall + mechanism which will be used for Kerberos session negotiation + and uid remapping. Some of these features also may depend on + setting a value of 1 to the pseudo-file /proc/fs/cifs/Experimental + (which is disabled by default). See the file fs/cifs/README + for more details. If unsure, say N. + +config CIFS_DFS_UPCALL + bool "DFS feature support (EXPERIMENTAL)" + depends on CIFS_EXPERIMENTAL + depends on KEYS + help + Enables an upcall mechanism for CIFS which contacts userspace + helper utilities to provide server name resolution (host names to + IP addresses) which is needed for implicit mounts of DFS junction + points. If unsure, say N. diff --git a/fs/cifs/file.c b/fs/cifs/file.c index c4a8a0605125..62d8bd8f14c0 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1791,7 +1791,7 @@ static void cifs_copy_cache_pages(struct address_space *mapping, SetPageUptodate(page); unlock_page(page); if (!pagevec_add(plru_pvec, page)) - __pagevec_lru_add(plru_pvec); + __pagevec_lru_add_file(plru_pvec); data += PAGE_CACHE_SIZE; } return; @@ -1925,7 +1925,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, bytes_read = 0; } - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); /* need to free smb_read_data buf before exit */ if (smb_read_data) { diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 0d9b80ec689c..cfd29da714d1 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -362,9 +362,8 @@ static int init_coda_psdev(void) goto out_chrdev; } for (i = 0; i < MAX_CODADEVS; i++) - device_create_drvdata(coda_psdev_class, NULL, - MKDEV(CODA_PSDEV_MAJOR, i), - NULL, "cfs%d", i); + device_create(coda_psdev_class, NULL, + 
MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i); coda_sysctl_init(); goto out; diff --git a/fs/compat.c b/fs/compat.c index 075d0509970d..5f9ec449c799 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -137,6 +137,45 @@ asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval _ return compat_sys_futimesat(AT_FDCWD, filename, t); } +static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) +{ + compat_ino_t ino = stat->ino; + typeof(ubuf->st_uid) uid = 0; + typeof(ubuf->st_gid) gid = 0; + int err; + + SET_UID(uid, stat->uid); + SET_GID(gid, stat->gid); + + if ((u64) stat->size > MAX_NON_LFS || + !old_valid_dev(stat->dev) || + !old_valid_dev(stat->rdev)) + return -EOVERFLOW; + if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino) + return -EOVERFLOW; + + if (clear_user(ubuf, sizeof(*ubuf))) + return -EFAULT; + + err = __put_user(old_encode_dev(stat->dev), &ubuf->st_dev); + err |= __put_user(ino, &ubuf->st_ino); + err |= __put_user(stat->mode, &ubuf->st_mode); + err |= __put_user(stat->nlink, &ubuf->st_nlink); + err |= __put_user(uid, &ubuf->st_uid); + err |= __put_user(gid, &ubuf->st_gid); + err |= __put_user(old_encode_dev(stat->rdev), &ubuf->st_rdev); + err |= __put_user(stat->size, &ubuf->st_size); + err |= __put_user(stat->atime.tv_sec, &ubuf->st_atime); + err |= __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec); + err |= __put_user(stat->mtime.tv_sec, &ubuf->st_mtime); + err |= __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec); + err |= __put_user(stat->ctime.tv_sec, &ubuf->st_ctime); + err |= __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec); + err |= __put_user(stat->blksize, &ubuf->st_blksize); + err |= __put_user(stat->blocks, &ubuf->st_blocks); + return err; +} + asmlinkage long compat_sys_newstat(char __user * filename, struct compat_stat __user *statbuf) { @@ -1239,7 +1278,7 @@ static int compat_count(compat_uptr_t __user *argv, int max) if (!p) break; argv++; - if(++i > max) + if (i++ >= max) return 
-E2BIG; } } diff --git a/fs/direct-io.c b/fs/direct-io.c index 9606ee848fd8..af0558dbe8b7 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -5,11 +5,11 @@ * * O_DIRECT * - * 04Jul2002 akpm@zip.com.au + * 04Jul2002 Andrew Morton * Initial version * 11Sep2002 janetinc@us.ibm.com * added readv/writev support. - * 29Oct2002 akpm@zip.com.au + * 29Oct2002 Andrew Morton * rewrote bio_add_page() support. * 30Oct2002 pbadari@us.ibm.com * added support for non-aligned IO. diff --git a/fs/dquot.c b/fs/dquot.c index ad7e59003e04..da30a27f2242 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -9,8 +9,6 @@ * implementation is based on one of the several variants of the LINUX * inode-subsystem with added complexity of the diskquota system. * - * Version: $Id: dquot.c,v 6.3 1996/11/17 18:35:34 mvw Exp mvw $ - * * Author: Marco van Wieringen <mvw@planets.elm.net> * * Fixes: Dmitry Gorodchanin <pgmdsg@ibi.com>, 11 Feb 96 diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile index b4755a85996e..2cc9ee4ad2eb 100644 --- a/fs/ecryptfs/Makefile +++ b/fs/ecryptfs/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o -ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o netlink.o miscdev.o kthread.o debug.o +ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o miscdev.o kthread.o debug.o diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index b73fb752c5f8..3504cf9df358 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -79,11 +79,6 @@ #define ECRYPTFS_MAX_PKI_NAME_BYTES 16 #define ECRYPTFS_DEFAULT_NUM_USERS 4 #define ECRYPTFS_MAX_NUM_USERS 32768 -#define ECRYPTFS_TRANSPORT_NETLINK 0 -#define ECRYPTFS_TRANSPORT_CONNECTOR 1 -#define ECRYPTFS_TRANSPORT_RELAYFS 2 -#define ECRYPTFS_TRANSPORT_MISCDEV 3 -#define ECRYPTFS_DEFAULT_TRANSPORT ECRYPTFS_TRANSPORT_MISCDEV #define ECRYPTFS_XATTR_NAME "user.ecryptfs" #define 
RFC2440_CIPHER_DES3_EDE 0x02 @@ -400,8 +395,6 @@ struct ecryptfs_msg_ctx { struct mutex mux; }; -extern unsigned int ecryptfs_transport; - struct ecryptfs_daemon; struct ecryptfs_daemon { @@ -627,31 +620,20 @@ int ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode); -int ecryptfs_process_helo(unsigned int transport, uid_t euid, - struct user_namespace *user_ns, struct pid *pid); +int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns, + struct pid *pid); int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns, struct pid *pid); int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, struct user_namespace *user_ns, struct pid *pid, u32 seq); -int ecryptfs_send_message(unsigned int transport, char *data, int data_len, +int ecryptfs_send_message(char *data, int data_len, struct ecryptfs_msg_ctx **msg_ctx); int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, struct ecryptfs_message **emsg); -int ecryptfs_init_messaging(unsigned int transport); -void ecryptfs_release_messaging(unsigned int transport); +int ecryptfs_init_messaging(void); +void ecryptfs_release_messaging(void); -int ecryptfs_send_netlink(char *data, int data_len, - struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, - u16 msg_flags, struct pid *daemon_pid); -int ecryptfs_init_netlink(void); -void ecryptfs_release_netlink(void); - -int ecryptfs_send_connector(char *data, int data_len, - struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, - u16 msg_flags, struct pid *daemon_pid); -int ecryptfs_init_connector(void); -void ecryptfs_release_connector(void); void ecryptfs_write_header_metadata(char *virt, struct ecryptfs_crypt_stat *crypt_stat, diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 9244d653743e..eb3dc4c7ac06 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -71,12 +71,11 @@ struct ecryptfs_getdents_callback { 
void *dirent; struct dentry *dentry; filldir_t filldir; - int err; int filldir_called; int entries_written; }; -/* Inspired by generic filldir in fs/readir.c */ +/* Inspired by generic filldir in fs/readdir.c */ static int ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) @@ -125,18 +124,18 @@ static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) buf.dirent = dirent; buf.dentry = file->f_path.dentry; buf.filldir = filldir; -retry: buf.filldir_called = 0; buf.entries_written = 0; - buf.err = 0; rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf); - if (buf.err) - rc = buf.err; - if (buf.filldir_called && !buf.entries_written) - goto retry; file->f_pos = lower_file->f_pos; + if (rc < 0) + goto out; + if (buf.filldir_called && !buf.entries_written) + goto out; if (rc >= 0) - fsstack_copy_attr_atime(inode, lower_file->f_path.dentry->d_inode); + fsstack_copy_attr_atime(inode, + lower_file->f_path.dentry->d_inode); +out: return rc; } diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index f5b76a331b9c..e22bc3961345 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -234,8 +234,8 @@ parse_tag_65_packet(struct ecryptfs_session_key *session_key, u8 *cipher_code, } i += data_len; if (message_len < (i + m_size)) { - ecryptfs_printk(KERN_ERR, "The received netlink message is " - "shorter than expected\n"); + ecryptfs_printk(KERN_ERR, "The message received from ecryptfsd " + "is shorter than expected\n"); rc = -EIO; goto out; } @@ -438,8 +438,8 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, struct ecryptfs_msg_ctx *msg_ctx; struct ecryptfs_message *msg = NULL; char *auth_tok_sig; - char *netlink_message; - size_t netlink_message_length; + char *payload; + size_t payload_len; int rc; rc = ecryptfs_get_auth_tok_sig(&auth_tok_sig, auth_tok); @@ -449,15 +449,15 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, goto 
out; } rc = write_tag_64_packet(auth_tok_sig, &(auth_tok->session_key), - &netlink_message, &netlink_message_length); + &payload, &payload_len); if (rc) { ecryptfs_printk(KERN_ERR, "Failed to write tag 64 packet\n"); goto out; } - rc = ecryptfs_send_message(ecryptfs_transport, netlink_message, - netlink_message_length, &msg_ctx); + rc = ecryptfs_send_message(payload, payload_len, &msg_ctx); if (rc) { - ecryptfs_printk(KERN_ERR, "Error sending netlink message\n"); + ecryptfs_printk(KERN_ERR, "Error sending message to " + "ecryptfsd\n"); goto out; } rc = ecryptfs_wait_for_response(msg_ctx, &msg); @@ -1333,23 +1333,22 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok, struct ecryptfs_key_record *key_rec) { struct ecryptfs_msg_ctx *msg_ctx = NULL; - char *netlink_payload; - size_t netlink_payload_length; + char *payload = NULL; + size_t payload_len; struct ecryptfs_message *msg; int rc; rc = write_tag_66_packet(auth_tok->token.private_key.signature, ecryptfs_code_for_cipher_string(crypt_stat), - crypt_stat, &netlink_payload, - &netlink_payload_length); + crypt_stat, &payload, &payload_len); if (rc) { ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); goto out; } - rc = ecryptfs_send_message(ecryptfs_transport, netlink_payload, - netlink_payload_length, &msg_ctx); + rc = ecryptfs_send_message(payload, payload_len, &msg_ctx); if (rc) { - ecryptfs_printk(KERN_ERR, "Error sending netlink message\n"); + ecryptfs_printk(KERN_ERR, "Error sending message to " + "ecryptfsd\n"); goto out; } rc = ecryptfs_wait_for_response(msg_ctx, &msg); @@ -1364,8 +1363,7 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok, ecryptfs_printk(KERN_ERR, "Error parsing tag 67 packet\n"); kfree(msg); out: - if (netlink_payload) - kfree(netlink_payload); + kfree(payload); return rc; } /** diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 8ebe9a5d1d99..046e027a4cb1 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -30,7 +30,6 @@ #include 
<linux/namei.h> #include <linux/skbuff.h> #include <linux/crypto.h> -#include <linux/netlink.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/key.h> @@ -49,8 +48,7 @@ MODULE_PARM_DESC(ecryptfs_verbosity, "0, which is Quiet)"); /** - * Module parameter that defines the number of netlink message buffer - * elements + * Module parameter that defines the number of message buffer elements */ unsigned int ecryptfs_message_buf_len = ECRYPTFS_DEFAULT_MSG_CTX_ELEMS; @@ -60,9 +58,9 @@ MODULE_PARM_DESC(ecryptfs_message_buf_len, /** * Module parameter that defines the maximum guaranteed amount of time to wait - * for a response through netlink. The actual sleep time will be, more than + * for a response from ecryptfsd. The actual sleep time will be, more than * likely, a small amount greater than this specified value, but only less if - * the netlink message successfully arrives. + * the message successfully arrives. */ signed long ecryptfs_message_wait_timeout = ECRYPTFS_MAX_MSG_CTX_TTL / HZ; @@ -83,8 +81,6 @@ module_param(ecryptfs_number_of_users, uint, 0); MODULE_PARM_DESC(ecryptfs_number_of_users, "An estimate of the number of " "concurrent users of eCryptfs"); -unsigned int ecryptfs_transport = ECRYPTFS_DEFAULT_TRANSPORT; - void __ecryptfs_printk(const char *fmt, ...) 
{ va_list args; @@ -779,10 +775,11 @@ static int __init ecryptfs_init(void) "rc = [%d]\n", __func__, rc); goto out_do_sysfs_unregistration; } - rc = ecryptfs_init_messaging(ecryptfs_transport); + rc = ecryptfs_init_messaging(); if (rc) { printk(KERN_ERR "Failure occured while attempting to " - "initialize the eCryptfs netlink socket\n"); + "initialize the communications channel to " + "ecryptfsd\n"); goto out_destroy_kthread; } rc = ecryptfs_init_crypto(); @@ -797,7 +794,7 @@ static int __init ecryptfs_init(void) goto out; out_release_messaging: - ecryptfs_release_messaging(ecryptfs_transport); + ecryptfs_release_messaging(); out_destroy_kthread: ecryptfs_destroy_kthread(); out_do_sysfs_unregistration: @@ -818,7 +815,7 @@ static void __exit ecryptfs_exit(void) if (rc) printk(KERN_ERR "Failure whilst attempting to destroy crypto; " "rc = [%d]\n", rc); - ecryptfs_release_messaging(ecryptfs_transport); + ecryptfs_release_messaging(); ecryptfs_destroy_kthread(); do_sysfs_unregistration(); unregister_filesystem(&ecryptfs_fs_type); diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 1b5c20058acb..c6983978a31e 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -134,12 +134,11 @@ out: } static int -ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len, - u8 msg_type, struct ecryptfs_msg_ctx **msg_ctx); +ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type, + struct ecryptfs_msg_ctx **msg_ctx); /** * ecryptfs_send_raw_message - * @transport: Transport type * @msg_type: Message type * @daemon: Daemon struct for recipient of message * @@ -150,38 +149,25 @@ ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len, * * Returns zero on success; non-zero otherwise */ -static int ecryptfs_send_raw_message(unsigned int transport, u8 msg_type, +static int ecryptfs_send_raw_message(u8 msg_type, struct ecryptfs_daemon *daemon) { struct ecryptfs_msg_ctx *msg_ctx; int rc; - switch(transport) { 
- case ECRYPTFS_TRANSPORT_NETLINK: - rc = ecryptfs_send_netlink(NULL, 0, NULL, msg_type, 0, - daemon->pid); - break; - case ECRYPTFS_TRANSPORT_MISCDEV: - rc = ecryptfs_send_message_locked(transport, NULL, 0, msg_type, - &msg_ctx); - if (rc) { - printk(KERN_ERR "%s: Error whilst attempting to send " - "message via procfs; rc = [%d]\n", __func__, rc); - goto out; - } - /* Raw messages are logically context-free (e.g., no - * reply is expected), so we set the state of the - * ecryptfs_msg_ctx object to indicate that it should - * be freed as soon as the transport sends out the message. */ - mutex_lock(&msg_ctx->mux); - msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_NO_REPLY; - mutex_unlock(&msg_ctx->mux); - break; - case ECRYPTFS_TRANSPORT_CONNECTOR: - case ECRYPTFS_TRANSPORT_RELAYFS: - default: - rc = -ENOSYS; + rc = ecryptfs_send_message_locked(NULL, 0, msg_type, &msg_ctx); + if (rc) { + printk(KERN_ERR "%s: Error whilst attempting to send " + "message to ecryptfsd; rc = [%d]\n", __func__, rc); + goto out; } + /* Raw messages are logically context-free (e.g., no + * reply is expected), so we set the state of the + * ecryptfs_msg_ctx object to indicate that it should + * be freed as soon as the message is sent. */ + mutex_lock(&msg_ctx->mux); + msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_NO_REPLY; + mutex_unlock(&msg_ctx->mux); out: return rc; } @@ -227,7 +213,6 @@ out: /** * ecryptfs_process_helo - * @transport: The underlying transport (netlink, etc.) * @euid: The user ID owner of the message * @user_ns: The namespace in which @euid applies * @pid: The process ID for the userspace program that sent the @@ -239,8 +224,8 @@ out: * Returns zero after adding a new daemon to the hash list; * non-zero otherwise. 
*/ -int ecryptfs_process_helo(unsigned int transport, uid_t euid, - struct user_namespace *user_ns, struct pid *pid) +int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns, + struct pid *pid) { struct ecryptfs_daemon *new_daemon; struct ecryptfs_daemon *old_daemon; @@ -252,8 +237,7 @@ int ecryptfs_process_helo(unsigned int transport, uid_t euid, printk(KERN_WARNING "Received request from user [%d] " "to register daemon [0x%p]; unregistering daemon " "[0x%p]\n", euid, pid, old_daemon->pid); - rc = ecryptfs_send_raw_message(transport, ECRYPTFS_MSG_QUIT, - old_daemon); + rc = ecryptfs_send_raw_message(ECRYPTFS_MSG_QUIT, old_daemon); if (rc) printk(KERN_WARNING "Failed to send QUIT " "message to daemon [0x%p]; rc = [%d]\n", @@ -467,8 +451,6 @@ out: /** * ecryptfs_send_message_locked - * @transport: The transport over which to send the message (i.e., - * netlink) * @data: The data to send * @data_len: The length of data * @msg_ctx: The message context allocated for the send @@ -478,8 +460,8 @@ out: * Returns zero on success; non-zero otherwise */ static int -ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len, - u8 msg_type, struct ecryptfs_msg_ctx **msg_ctx) +ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type, + struct ecryptfs_msg_ctx **msg_ctx) { struct ecryptfs_daemon *daemon; int rc; @@ -503,20 +485,8 @@ ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len, ecryptfs_msg_ctx_free_to_alloc(*msg_ctx); mutex_unlock(&(*msg_ctx)->mux); mutex_unlock(&ecryptfs_msg_ctx_lists_mux); - switch (transport) { - case ECRYPTFS_TRANSPORT_NETLINK: - rc = ecryptfs_send_netlink(data, data_len, *msg_ctx, msg_type, - 0, daemon->pid); - break; - case ECRYPTFS_TRANSPORT_MISCDEV: - rc = ecryptfs_send_miscdev(data, data_len, *msg_ctx, msg_type, - 0, daemon); - break; - case ECRYPTFS_TRANSPORT_CONNECTOR: - case ECRYPTFS_TRANSPORT_RELAYFS: - default: - rc = -ENOSYS; - } + rc = ecryptfs_send_miscdev(data, 
data_len, *msg_ctx, msg_type, 0, + daemon); if (rc) printk(KERN_ERR "%s: Error attempting to send message to " "userspace daemon; rc = [%d]\n", __func__, rc); @@ -526,8 +496,6 @@ out: /** * ecryptfs_send_message - * @transport: The transport over which to send the message (i.e., - * netlink) * @data: The data to send * @data_len: The length of data * @msg_ctx: The message context allocated for the send @@ -536,14 +504,14 @@ out: * * Returns zero on success; non-zero otherwise */ -int ecryptfs_send_message(unsigned int transport, char *data, int data_len, +int ecryptfs_send_message(char *data, int data_len, struct ecryptfs_msg_ctx **msg_ctx) { int rc; mutex_lock(&ecryptfs_daemon_hash_mux); - rc = ecryptfs_send_message_locked(transport, data, data_len, - ECRYPTFS_MSG_REQUEST, msg_ctx); + rc = ecryptfs_send_message_locked(data, data_len, ECRYPTFS_MSG_REQUEST, + msg_ctx); mutex_unlock(&ecryptfs_daemon_hash_mux); return rc; } @@ -586,7 +554,7 @@ sleep: return rc; } -int ecryptfs_init_messaging(unsigned int transport) +int ecryptfs_init_messaging(void) { int i; int rc = 0; @@ -639,27 +607,14 @@ int ecryptfs_init_messaging(unsigned int transport) mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux); } mutex_unlock(&ecryptfs_msg_ctx_lists_mux); - switch(transport) { - case ECRYPTFS_TRANSPORT_NETLINK: - rc = ecryptfs_init_netlink(); - if (rc) - ecryptfs_release_messaging(transport); - break; - case ECRYPTFS_TRANSPORT_MISCDEV: - rc = ecryptfs_init_ecryptfs_miscdev(); - if (rc) - ecryptfs_release_messaging(transport); - break; - case ECRYPTFS_TRANSPORT_CONNECTOR: - case ECRYPTFS_TRANSPORT_RELAYFS: - default: - rc = -ENOSYS; - } + rc = ecryptfs_init_ecryptfs_miscdev(); + if (rc) + ecryptfs_release_messaging(); out: return rc; } -void ecryptfs_release_messaging(unsigned int transport) +void ecryptfs_release_messaging(void) { if (ecryptfs_msg_ctx_arr) { int i; @@ -698,17 +653,6 @@ void ecryptfs_release_messaging(unsigned int transport) kfree(ecryptfs_daemon_hash); 
mutex_unlock(&ecryptfs_daemon_hash_mux); } - switch(transport) { - case ECRYPTFS_TRANSPORT_NETLINK: - ecryptfs_release_netlink(); - break; - case ECRYPTFS_TRANSPORT_MISCDEV: - ecryptfs_destroy_ecryptfs_miscdev(); - break; - case ECRYPTFS_TRANSPORT_CONNECTOR: - case ECRYPTFS_TRANSPORT_RELAYFS: - default: - break; - } + ecryptfs_destroy_ecryptfs_miscdev(); return; } diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 245c2dc02d5c..04d7b3fa1ac6 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -265,22 +265,34 @@ out: } /** - * ecryptfs_prepare_write + * ecryptfs_write_begin * @file: The eCryptfs file - * @page: The eCryptfs page - * @from: The start byte from which we will write - * @to: The end byte to which we will write + * @mapping: The eCryptfs object + * @pos: The file offset at which to start writing + * @len: Length of the write + * @flags: Various flags + * @pagep: Pointer to return the page + * @fsdata: Pointer to return fs data (unused) * * This function must zero any hole we create * * Returns zero on success; non-zero otherwise */ -static int ecryptfs_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int ecryptfs_write_begin(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; loff_t prev_page_end_size; int rc = 0; + page = __grab_cache_page(mapping, index); + if (!page) + return -ENOMEM; + *pagep = page; + if (!PageUptodate(page)) { struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private( @@ -289,8 +301,7 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED) || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) { rc = ecryptfs_read_lower_page_segment( - page, page->index, 0, PAGE_CACHE_SIZE, - page->mapping->host); + page, index, 0, PAGE_CACHE_SIZE, mapping->host); if (rc) { 
printk(KERN_ERR "%s: Error attemping to read " "lower page segment; rc = [%d]\n", @@ -316,8 +327,8 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, SetPageUptodate(page); } else { rc = ecryptfs_read_lower_page_segment( - page, page->index, 0, PAGE_CACHE_SIZE, - page->mapping->host); + page, index, 0, PAGE_CACHE_SIZE, + mapping->host); if (rc) { printk(KERN_ERR "%s: Error reading " "page; rc = [%d]\n", @@ -339,10 +350,10 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, SetPageUptodate(page); } } - prev_page_end_size = ((loff_t)page->index << PAGE_CACHE_SHIFT); + prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT); /* If creating a page or more of holes, zero them out via truncate. * Note, this will increase i_size. */ - if (page->index != 0) { + if (index != 0) { if (prev_page_end_size > i_size_read(page->mapping->host)) { rc = ecryptfs_truncate(file->f_path.dentry, prev_page_end_size); @@ -357,8 +368,8 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, } /* Writing to a new page, and creating a small hole from start * of page? Zero it out. */ - if ((i_size_read(page->mapping->host) == prev_page_end_size) - && (from != 0)) + if ((i_size_read(mapping->host) == prev_page_end_size) + && (pos != 0)) zero_user(page, 0, PAGE_CACHE_SIZE); out: return rc; @@ -445,21 +456,28 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode) } /** - * ecryptfs_commit_write + * ecryptfs_write_end * @file: The eCryptfs file object + * @mapping: The eCryptfs object + * @pos: The file position + * @len: The length of the data (unused) + * @copied: The amount of data copied * @page: The eCryptfs page - * @from: Ignored (we rotate the page IV on each write) - * @to: Ignored + * @fsdata: The fsdata (unused) * * This is where we encrypt the data and pass the encrypted data to * the lower filesystem. In OpenPGP-compatible mode, we operate on * entire underlying packets. 
*/ -static int ecryptfs_commit_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int ecryptfs_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { - loff_t pos; - struct inode *ecryptfs_inode = page->mapping->host; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + copied; + struct inode *ecryptfs_inode = mapping->host; struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat; int rc; @@ -471,25 +489,22 @@ static int ecryptfs_commit_write(struct file *file, struct page *page, } else ecryptfs_printk(KERN_DEBUG, "Not a new file\n"); ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" - "(page w/ index = [0x%.16x], to = [%d])\n", page->index, - to); + "(page w/ index = [0x%.16x], to = [%d])\n", index, to); /* Fills in zeros if 'to' goes beyond inode size */ rc = fill_zeros_to_end_of_page(page, to); if (rc) { ecryptfs_printk(KERN_WARNING, "Error attempting to fill " - "zeros in page with index = [0x%.16x]\n", - page->index); + "zeros in page with index = [0x%.16x]\n", index); goto out; } rc = ecryptfs_encrypt_page(page); if (rc) { ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper " - "index [0x%.16x])\n", page->index); + "index [0x%.16x])\n", index); goto out; } - pos = (((loff_t)page->index) << PAGE_CACHE_SHIFT) + to; - if (pos > i_size_read(ecryptfs_inode)) { - i_size_write(ecryptfs_inode, pos); + if (pos + copied > i_size_read(ecryptfs_inode)) { + i_size_write(ecryptfs_inode, pos + copied); ecryptfs_printk(KERN_DEBUG, "Expanded file size to " "[0x%.16x]\n", i_size_read(ecryptfs_inode)); } @@ -497,7 +512,11 @@ static int ecryptfs_commit_write(struct file *file, struct page *page, if (rc) printk(KERN_ERR "Error writing inode size to metadata; " "rc = [%d]\n", rc); + else + rc = copied; out: + unlock_page(page); + 
page_cache_release(page); return rc; } @@ -518,7 +537,7 @@ static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block) struct address_space_operations ecryptfs_aops = { .writepage = ecryptfs_writepage, .readpage = ecryptfs_readpage, - .prepare_write = ecryptfs_prepare_write, - .commit_write = ecryptfs_commit_write, + .write_begin = ecryptfs_write_begin, + .write_end = ecryptfs_write_end, .bmap = ecryptfs_bmap, }; diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c deleted file mode 100644 index e0abad62b395..000000000000 --- a/fs/ecryptfs/netlink.c +++ /dev/null @@ -1,249 +0,0 @@ -/** - * eCryptfs: Linux filesystem encryption layer - * - * Copyright (C) 2004-2006 International Business Machines Corp. - * Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com> - * Tyler Hicks <tyhicks@ou.edu> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA - * 02111-1307, USA. 
- */ - -#include <net/sock.h> -#include <linux/hash.h> -#include <linux/random.h> -#include "ecryptfs_kernel.h" - -static struct sock *ecryptfs_nl_sock; - -/** - * ecryptfs_send_netlink - * @data: The data to include as the payload - * @data_len: The byte count of the data - * @msg_ctx: The netlink context that will be used to handle the - * response message - * @msg_type: The type of netlink message to send - * @msg_flags: The flags to include in the netlink header - * @daemon_pid: The process id of the daemon to send the message to - * - * Sends the data to the specified daemon pid and uses the netlink - * context element to store the data needed for validation upon - * receiving the response. The data and the netlink context can be - * null if just sending a netlink header is sufficient. Returns zero - * upon sending the message; non-zero upon error. - */ -int ecryptfs_send_netlink(char *data, int data_len, - struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, - u16 msg_flags, struct pid *daemon_pid) -{ - struct sk_buff *skb; - struct nlmsghdr *nlh; - struct ecryptfs_message *msg; - size_t payload_len; - int rc; - - payload_len = ((data && data_len) ? (sizeof(*msg) + data_len) : 0); - skb = alloc_skb(NLMSG_SPACE(payload_len), GFP_KERNEL); - if (!skb) { - rc = -ENOMEM; - ecryptfs_printk(KERN_ERR, "Failed to allocate socket buffer\n"); - goto out; - } - nlh = NLMSG_PUT(skb, pid_nr(daemon_pid), msg_ctx ? 
msg_ctx->counter : 0, - msg_type, payload_len); - nlh->nlmsg_flags = msg_flags; - if (msg_ctx && payload_len) { - msg = (struct ecryptfs_message *)NLMSG_DATA(nlh); - msg->index = msg_ctx->index; - msg->data_len = data_len; - memcpy(msg->data, data, data_len); - } - rc = netlink_unicast(ecryptfs_nl_sock, skb, pid_nr(daemon_pid), 0); - if (rc < 0) { - ecryptfs_printk(KERN_ERR, "Failed to send eCryptfs netlink " - "message; rc = [%d]\n", rc); - goto out; - } - rc = 0; - goto out; -nlmsg_failure: - rc = -EMSGSIZE; - kfree_skb(skb); -out: - return rc; -} - -/** - * ecryptfs_process_nl_reponse - * @skb: The socket buffer containing the netlink message of state - * RESPONSE - * - * Processes a response message after sending a operation request to - * userspace. Attempts to assign the msg to a netlink context element - * at the index specified in the msg. The sk_buff and nlmsghdr must - * be validated before this function. Returns zero upon delivery to - * desired context element; non-zero upon delivery failure or error. - */ -static int ecryptfs_process_nl_response(struct sk_buff *skb) -{ - struct nlmsghdr *nlh = nlmsg_hdr(skb); - struct ecryptfs_message *msg = NLMSG_DATA(nlh); - struct pid *pid; - int rc; - - if (skb->len - NLMSG_HDRLEN - sizeof(*msg) != msg->data_len) { - rc = -EINVAL; - ecryptfs_printk(KERN_ERR, "Received netlink message with " - "incorrectly specified data length\n"); - goto out; - } - pid = find_get_pid(NETLINK_CREDS(skb)->pid); - rc = ecryptfs_process_response(msg, NETLINK_CREDS(skb)->uid, NULL, - pid, nlh->nlmsg_seq); - put_pid(pid); - if (rc) - printk(KERN_ERR - "Error processing response message; rc = [%d]\n", rc); -out: - return rc; -} - -/** - * ecryptfs_process_nl_helo - * @skb: The socket buffer containing the nlmsghdr in HELO state - * - * Gets uid and pid of the skb and adds the values to the daemon id - * hash. Returns zero after adding a new daemon id to the hash list; - * non-zero otherwise. 
- */ -static int ecryptfs_process_nl_helo(struct sk_buff *skb) -{ - struct pid *pid; - int rc; - - pid = find_get_pid(NETLINK_CREDS(skb)->pid); - rc = ecryptfs_process_helo(ECRYPTFS_TRANSPORT_NETLINK, - NETLINK_CREDS(skb)->uid, NULL, pid); - put_pid(pid); - if (rc) - printk(KERN_WARNING "Error processing HELO; rc = [%d]\n", rc); - return rc; -} - -/** - * ecryptfs_process_nl_quit - * @skb: The socket buffer containing the nlmsghdr in QUIT state - * - * Gets uid and pid of the skb and deletes the corresponding daemon - * id, if it is the registered that is requesting the - * deletion. Returns zero after deleting the desired daemon id; - * non-zero otherwise. - */ -static int ecryptfs_process_nl_quit(struct sk_buff *skb) -{ - struct pid *pid; - int rc; - - pid = find_get_pid(NETLINK_CREDS(skb)->pid); - rc = ecryptfs_process_quit(NETLINK_CREDS(skb)->uid, NULL, pid); - put_pid(pid); - if (rc) - printk(KERN_WARNING - "Error processing QUIT message; rc = [%d]\n", rc); - return rc; -} - -/** - * ecryptfs_receive_nl_message - * - * Callback function called by netlink system when a message arrives. - * If the message looks to be valid, then an attempt is made to assign - * it to its desired netlink context element and wake up the process - * that is waiting for a response. 
- */ -static void ecryptfs_receive_nl_message(struct sk_buff *skb) -{ - struct nlmsghdr *nlh; - - nlh = nlmsg_hdr(skb); - if (!NLMSG_OK(nlh, skb->len)) { - ecryptfs_printk(KERN_ERR, "Received corrupt netlink " - "message\n"); - goto free; - } - switch (nlh->nlmsg_type) { - case ECRYPTFS_MSG_RESPONSE: - if (ecryptfs_process_nl_response(skb)) { - ecryptfs_printk(KERN_WARNING, "Failed to " - "deliver netlink response to " - "requesting operation\n"); - } - break; - case ECRYPTFS_MSG_HELO: - if (ecryptfs_process_nl_helo(skb)) { - ecryptfs_printk(KERN_WARNING, "Failed to " - "fulfill HELO request\n"); - } - break; - case ECRYPTFS_MSG_QUIT: - if (ecryptfs_process_nl_quit(skb)) { - ecryptfs_printk(KERN_WARNING, "Failed to " - "fulfill QUIT request\n"); - } - break; - default: - ecryptfs_printk(KERN_WARNING, "Dropping netlink " - "message of unrecognized type [%d]\n", - nlh->nlmsg_type); - break; - } -free: - kfree_skb(skb); -} - -/** - * ecryptfs_init_netlink - * - * Initializes the daemon id hash list, netlink context array, and - * necessary locks. Returns zero upon success; non-zero upon error. - */ -int ecryptfs_init_netlink(void) -{ - int rc; - - ecryptfs_nl_sock = netlink_kernel_create(&init_net, NETLINK_ECRYPTFS, 0, - ecryptfs_receive_nl_message, - NULL, THIS_MODULE); - if (!ecryptfs_nl_sock) { - rc = -EIO; - ecryptfs_printk(KERN_ERR, "Failed to create netlink socket\n"); - goto out; - } - ecryptfs_nl_sock->sk_sndtimeo = ECRYPTFS_DEFAULT_SEND_TIMEOUT; - rc = 0; -out: - return rc; -} - -/** - * ecryptfs_release_netlink - * - * Frees all memory used by the netlink context array and releases the - * netlink socket. 
- */ -void ecryptfs_release_netlink(void) -{ - netlink_kernel_release(ecryptfs_nl_sock); - ecryptfs_nl_sock = NULL; -} diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 7cc0eb756b55..99368bda0261 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -927,14 +927,11 @@ errxit: /* * During the time we spent in the loop above, some other events * might have been queued by the poll callback. We re-insert them - * here (in case they are not already queued, or they're one-shot). + * inside the main ready-list here. */ for (nepi = ep->ovflist; (epi = nepi) != NULL; - nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { - if (!ep_is_linked(&epi->rdllink) && - (epi->event.events & ~EP_PRIVATE_BITS)) - list_add_tail(&epi->rdllink, &ep->rdllist); - } + nepi = epi->next, epi->next = EP_UNACTIVE_PTR) + list_add_tail(&epi->rdllink, &ep->rdllist); /* * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after * releasing the lock, events will be queued in the normal way inside diff --git a/fs/exec.c b/fs/exec.c index cecee501ce78..4e834f16d9da 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -50,15 +50,12 @@ #include <linux/cn_proc.h> #include <linux/audit.h> #include <linux/tracehook.h> +#include <linux/kmod.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlb.h> -#ifdef CONFIG_KMOD -#include <linux/kmod.h> -#endif - #ifdef __alpha__ /* for /sbin/loader handling in search_binary_handler() */ #include <linux/a.out.h> @@ -391,7 +388,7 @@ static int count(char __user * __user * argv, int max) if (!p) break; argv++; - if(++i > max) + if (i++ >= max) return -E2BIG; cond_resched(); } @@ -825,8 +822,6 @@ static int de_thread(struct task_struct *tsk) schedule(); } - if (unlikely(task_child_reaper(tsk) == leader)) - task_active_pid_ns(tsk)->child_reaper = tsk; /* * The only record we have of the real-time age of a * process, regardless of execs it's done, is start_time. 
@@ -1189,7 +1184,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) return retval; /* Remember if the application is TASO. */ - bprm->sh_bang = eh->ah.entry < 0x100000000UL; + bprm->taso = eh->ah.entry < 0x100000000UL; bprm->file = file; bprm->loader = loader; @@ -1247,8 +1242,8 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) read_unlock(&binfmt_lock); if (retval != -ENOEXEC || bprm->mm == NULL) { break; -#ifdef CONFIG_KMOD - }else{ +#ifdef CONFIG_MODULES + } else { #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && @@ -1391,7 +1386,7 @@ EXPORT_SYMBOL(set_binfmt); * name into corename, which must have space for at least * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. */ -static int format_corename(char *corename, int nr_threads, long signr) +static int format_corename(char *corename, long signr) { const char *pat_ptr = core_pattern; int ispipe = (*pat_ptr == '|'); @@ -1498,8 +1493,7 @@ static int format_corename(char *corename, int nr_threads, long signr) * If core_pattern does not include a %p (as is the default) * and core_uses_pid is set, then .%pid will be appended to * the filename. Do not do this for piped commands. 
*/ - if (!ispipe && !pid_in_pattern - && (core_uses_pid || nr_threads)) { + if (!ispipe && !pid_in_pattern && core_uses_pid) { rc = snprintf(out_ptr, out_end - out_ptr, ".%d", task_tgid_vnr(current)); if (rc > out_end - out_ptr) @@ -1762,7 +1756,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) * uses lock_kernel() */ lock_kernel(); - ispipe = format_corename(corename, retval, signr); + ispipe = format_corename(corename, signr); unlock_kernel(); /* * Don't bother to check the RLIMIT_CORE value if core_pattern points diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig new file mode 100644 index 000000000000..14a6780fd034 --- /dev/null +++ b/fs/ext2/Kconfig @@ -0,0 +1,55 @@ +config EXT2_FS + tristate "Second extended fs support" + help + Ext2 is a standard Linux file system for hard disks. + + To compile this file system support as a module, choose M here: the + module will be called ext2. + + If unsure, say Y. + +config EXT2_FS_XATTR + bool "Ext2 extended attributes" + depends on EXT2_FS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + If unsure, say N. + +config EXT2_FS_POSIX_ACL + bool "Ext2 POSIX Access Control Lists" + depends on EXT2_FS_XATTR + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N + +config EXT2_FS_SECURITY + bool "Ext2 Security Labels" + depends on EXT2_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ext2 filesystem. 
+ + If you are not using a security module that requires using + extended attributes for file security labels, say N. + +config EXT2_FS_XIP + bool "Ext2 execute in place support" + depends on EXT2_FS && MMU + help + Execute in place can be used on memory-backed block devices. If you + enable this option, you can select to mount block devices which are + capable of this feature without using the page cache. + + If you do not use a block device that is capable of using this, + or if unsure, say N. diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 10bb02c3f25c..6dac7ba2d22d 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -1295,6 +1295,7 @@ retry_alloc: * turn off reservation for this allocation */ if (my_rsv && (free_blocks < windowsz) + && (free_blocks > 0) && (rsv_is_empty(&my_rsv->rsv_window))) my_rsv = NULL; @@ -1332,7 +1333,7 @@ retry_alloc: * free blocks is less than half of the reservation * window size. */ - if (free_blocks <= (windowsz/2)) + if (my_rsv && (free_blocks <= (windowsz/2))) continue; brelse(bitmap_bh); diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index a78c6b4af060..11a49ce84392 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -103,7 +103,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) return err; } -static void ext2_check_page(struct page *page) +static void ext2_check_page(struct page *page, int quiet) { struct inode *dir = page->mapping->host; struct super_block *sb = dir->i_sb; @@ -146,10 +146,10 @@ out: /* Too bad, we had an error */ Ebadsize: - ext2_error(sb, "ext2_check_page", - "size of directory #%lu is not a multiple of chunk size", - dir->i_ino - ); + if (!quiet) + ext2_error(sb, __func__, + "size of directory #%lu is not a multiple " + "of chunk size", dir->i_ino); goto fail; Eshort: error = "rec_len is smaller than minimal"; @@ -166,32 +166,36 @@ Espan: Einumber: error = "inode out of bounds"; bad_entry: - ext2_error (sb, "ext2_check_page", "bad entry in directory #%lu: %s - " - "offset=%lu, 
inode=%lu, rec_len=%d, name_len=%d", - dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, - (unsigned long) le32_to_cpu(p->inode), - rec_len, p->name_len); + if (!quiet) + ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - " + "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, + (unsigned long) le32_to_cpu(p->inode), + rec_len, p->name_len); goto fail; Eend: - p = (ext2_dirent *)(kaddr + offs); - ext2_error (sb, "ext2_check_page", - "entry in directory #%lu spans the page boundary" - "offset=%lu, inode=%lu", - dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs, - (unsigned long) le32_to_cpu(p->inode)); + if (!quiet) { + p = (ext2_dirent *)(kaddr + offs); + ext2_error(sb, "ext2_check_page", + "entry in directory #%lu spans the page boundary" + "offset=%lu, inode=%lu", + dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs, + (unsigned long) le32_to_cpu(p->inode)); + } fail: SetPageChecked(page); SetPageError(page); } -static struct page * ext2_get_page(struct inode *dir, unsigned long n) +static struct page * ext2_get_page(struct inode *dir, unsigned long n, + int quiet) { struct address_space *mapping = dir->i_mapping; struct page *page = read_mapping_page(mapping, n, NULL); if (!IS_ERR(page)) { kmap(page); if (!PageChecked(page)) - ext2_check_page(page); + ext2_check_page(page, quiet); if (PageError(page)) goto fail; } @@ -292,7 +296,7 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; ext2_dirent *de; - struct page *page = ext2_get_page(inode, n); + struct page *page = ext2_get_page(inode, n, 0); if (IS_ERR(page)) { ext2_error(sb, __func__, @@ -361,6 +365,7 @@ struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir, struct page *page = NULL; struct ext2_inode_info *ei = EXT2_I(dir); ext2_dirent * de; + int dir_has_error = 0; if (npages == 0) goto out; @@ -374,7 +379,7 @@ struct ext2_dir_entry_2 * 
ext2_find_entry (struct inode * dir, n = start; do { char *kaddr; - page = ext2_get_page(dir, n); + page = ext2_get_page(dir, n, dir_has_error); if (!IS_ERR(page)) { kaddr = page_address(page); de = (ext2_dirent *) kaddr; @@ -391,7 +396,9 @@ struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir, de = ext2_next_entry(de); } ext2_put_page(page); - } + } else + dir_has_error = 1; + if (++n >= npages) n = 0; /* next page is past the blocks we've got */ @@ -414,7 +421,7 @@ found: struct ext2_dir_entry_2 * ext2_dotdot (struct inode *dir, struct page **p) { - struct page *page = ext2_get_page(dir, 0); + struct page *page = ext2_get_page(dir, 0, 0); ext2_dirent *de = NULL; if (!IS_ERR(page)) { @@ -487,7 +494,7 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode) for (n = 0; n <= npages; n++) { char *dir_end; - page = ext2_get_page(dir, n); + page = ext2_get_page(dir, n, 0); err = PTR_ERR(page); if (IS_ERR(page)) goto out; @@ -655,14 +662,17 @@ int ext2_empty_dir (struct inode * inode) { struct page *page = NULL; unsigned long i, npages = dir_pages(inode); + int dir_has_error = 0; for (i = 0; i < npages; i++) { char *kaddr; ext2_dirent * de; - page = ext2_get_page(inode, i); + page = ext2_get_page(inode, i, dir_has_error); - if (IS_ERR(page)) + if (IS_ERR(page)) { + dir_has_error = 1; continue; + } kaddr = page_address(page); de = (ext2_dirent *)kaddr; diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig new file mode 100644 index 000000000000..8e0cfe44b0fc --- /dev/null +++ b/fs/ext3/Kconfig @@ -0,0 +1,67 @@ +config EXT3_FS + tristate "Ext3 journalling file system support" + select JBD + help + This is the journalling version of the Second extended file system + (often called ext3), the de facto standard Linux file system + (method to organize files on a storage device) for hard disks. + + The journalling code included in this driver means you do not have + to run e2fsck (file system checker) on your file systems after a + crash. 
The journal keeps track of any changes that were being made + at the time the system crashed, and can ensure that your file system + is consistent without the need for a lengthy check. + + Other than adding the journal to the file system, the on-disk format + of ext3 is identical to ext2. It is possible to freely switch + between using the ext3 driver and the ext2 driver, as long as the + file system has been cleanly unmounted, or e2fsck is run on the file + system. + + To add a journal on an existing ext2 file system or change the + behavior of ext3 file systems, you can use the tune2fs utility ("man + tune2fs"). To modify attributes of files and directories on ext3 + file systems, use chattr ("man chattr"). You need to be using + e2fsprogs version 1.20 or later in order to create ext3 journals + (available at <http://sourceforge.net/projects/e2fsprogs/>). + + To compile this file system support as a module, choose M here: the + module will be called ext3. + +config EXT3_FS_XATTR + bool "Ext3 extended attributes" + depends on EXT3_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + If unsure, say N. + + You need this for POSIX ACL support on ext3. + +config EXT3_FS_POSIX_ACL + bool "Ext3 POSIX Access Control Lists" + depends on EXT3_FS_XATTR + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N + +config EXT3_FS_SECURITY + bool "Ext3 Security Labels" + depends on EXT3_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. 
This option + enables an extended attribute handler for file security + labels in the ext3 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 92fd0338a6eb..f5b57a2ca35a 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -1547,6 +1547,7 @@ retry_alloc: * turn off reservation for this allocation */ if (my_rsv && (free_blocks < windowsz) + && (free_blocks > 0) && (rsv_is_empty(&my_rsv->rsv_window))) my_rsv = NULL; @@ -1585,7 +1586,7 @@ retry_alloc: * free blocks is less than half of the reservation * window size. */ - if (free_blocks <= (windowsz/2)) + if (my_rsv && (free_blocks <= (windowsz/2))) continue; brelse(bitmap_bh); diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 2eea96ec78ed..4c82531ea0a8 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -102,6 +102,7 @@ static int ext3_readdir(struct file * filp, int err; struct inode *inode = filp->f_path.dentry->d_inode; int ret = 0; + int dir_has_error = 0; sb = inode->i_sb; @@ -148,9 +149,12 @@ static int ext3_readdir(struct file * filp, * of recovering data when there's a bad sector */ if (!bh) { - ext3_error (sb, "ext3_readdir", - "directory #%lu contains a hole at offset %lu", - inode->i_ino, (unsigned long)filp->f_pos); + if (!dir_has_error) { + ext3_error(sb, __func__, "directory #%lu " + "contains a hole at offset %lld", + inode->i_ino, filp->f_pos); + dir_has_error = 1; + } /* corrupt size? Maybe no more blocks to read */ if (filp->f_pos > inode->i_blocks << 9) break; @@ -410,7 +414,7 @@ static int call_filldir(struct file * filp, void * dirent, get_dtype(sb, fname->file_type)); if (error) { filp->f_pos = curr_pos; - info->extra_fname = fname->next; + info->extra_fname = fname; return error; } fname = fname->next; @@ -449,11 +453,21 @@ static int ext3_dx_readdir(struct file * filp, * If there are any leftover names on the hash collision * chain, return them first. 
*/ - if (info->extra_fname && - call_filldir(filp, dirent, filldir, info->extra_fname)) - goto finished; + if (info->extra_fname) { + if (call_filldir(filp, dirent, filldir, info->extra_fname)) + goto finished; - if (!info->curr_node) + info->extra_fname = NULL; + info->curr_node = rb_next(info->curr_node); + if (!info->curr_node) { + if (info->next_hash == ~0) { + filp->f_pos = EXT3_HTREE_EOF; + goto finished; + } + info->curr_hash = info->next_hash; + info->curr_minor_hash = 0; + } + } else if (!info->curr_node) info->curr_node = rb_first(&info->root); while (1) { diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index ebfec4d0148e..f8424ad89971 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1186,6 +1186,13 @@ write_begin_failed: ext3_journal_stop(handle); unlock_page(page); page_cache_release(page); + /* + * block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 77278e947e94..78fdf3836370 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -790,7 +790,8 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) if (reserved_gdb || gdb_off == 0) { if (!EXT3_HAS_COMPAT_FEATURE(sb, - EXT3_FEATURE_COMPAT_RESIZE_INODE)){ + EXT3_FEATURE_COMPAT_RESIZE_INODE) + || !le16_to_cpu(es->s_reserved_gdt_blocks)) { ext3_warning(sb, __func__, "No reserved GDT blocks, can't resize"); return -EPERM; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 399a96a6c556..3a260af5544d 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -625,6 +625,9 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA) seq_puts(seq, ",data=writeback"); + if (test_opt(sb, DATA_ERR_ABORT)) + 
seq_puts(seq, ",data_err=abort"); + ext3_show_quota_options(seq, sb); return 0; @@ -754,6 +757,7 @@ enum { Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, @@ -796,6 +800,8 @@ static const match_table_t tokens = { {Opt_data_journal, "data=journal"}, {Opt_data_ordered, "data=ordered"}, {Opt_data_writeback, "data=writeback"}, + {Opt_data_err_abort, "data_err=abort"}, + {Opt_data_err_ignore, "data_err=ignore"}, {Opt_offusrjquota, "usrjquota="}, {Opt_usrjquota, "usrjquota=%s"}, {Opt_offgrpjquota, "grpjquota="}, @@ -1011,6 +1017,12 @@ static int parse_options (char *options, struct super_block *sb, sbi->s_mount_opt |= data_opt; } break; + case Opt_data_err_abort: + set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; + case Opt_data_err_ignore: + clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; #ifdef CONFIG_QUOTA case Opt_usrjquota: qtype = USRQUOTA; @@ -1986,6 +1998,10 @@ static void ext3_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_flags |= JFS_BARRIER; else journal->j_flags &= ~JFS_BARRIER; + if (test_opt(sb, DATA_ERR_ABORT)) + journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR; + else + journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR; spin_unlock(&journal->j_state_lock); } diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig new file mode 100644 index 000000000000..7505482a08fa --- /dev/null +++ b/fs/ext4/Kconfig @@ -0,0 +1,79 @@ +config EXT4_FS + tristate "The Extended 4 (ext4) filesystem" + select JBD2 + select CRC16 + help + This is the next generation of the ext3 filesystem. 
+ + Unlike the change from ext2 filesystem to ext3 filesystem, + the on-disk format of ext4 is not forwards compatible with + ext3; it is based on extent maps and it supports 48-bit + physical block numbers. The ext4 filesystem also supports delayed + allocation, persistent preallocation, high resolution time stamps, + and a number of other features to improve performance and speed + up fsck time. For more information, please see the web pages at + http://ext4.wiki.kernel.org. + + The ext4 filesystem will support mounting an ext3 + filesystem; while there will be some performance gains from + the delayed allocation and inode table readahead, the best + performance gains will require enabling ext4 features in the + filesystem, or formating a new filesystem as an ext4 + filesystem initially. + + To compile this file system support as a module, choose M here. The + module will be called ext4. + + If unsure, say N. + +config EXT4DEV_COMPAT + bool "Enable ext4dev compatibility" + depends on EXT4_FS + help + Starting with 2.6.28, the name of the ext4 filesystem was + renamed from ext4dev to ext4. Unfortunately there are some + legacy userspace programs (such as klibc's fstype) have + "ext4dev" hardcoded. + + To enable backwards compatibility so that systems that are + still expecting to mount ext4 filesystems using ext4dev, + chose Y here. This feature will go away by 2.6.31, so + please arrange to get your userspace programs fixed! + +config EXT4_FS_XATTR + bool "Ext4 extended attributes" + depends on EXT4_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + If unsure, say N. + + You need this for POSIX ACL support on ext4. 
+ +config EXT4_FS_POSIX_ACL + bool "Ext4 POSIX Access Control Lists" + depends on EXT4_FS_XATTR + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N + +config EXT4_FS_SECURITY + bool "Ext4 Security Labels" + depends on EXT4_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ext4 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index bd2ece228827..b9821be709bd 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -568,8 +568,16 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, /* this isn't the right place to decide whether block is metadata * inode.c/extents.c knows better, but for safety ... */ - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || - ext4_should_journal_data(inode)) + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + metadata = 1; + + /* We need to make sure we don't reuse + * block released untill the transaction commit. + * writeback mode have weak data consistency so + * don't force data as metadata when freeing block + * for writeback mode. 
+ */ + if (metadata == 0 && !ext4_should_writeback_data(inode)) metadata = 1; sb = inode->i_sb; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6690a41cdd9f..4880cc3e6727 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -511,7 +511,6 @@ do { \ /* * Mount flags */ -#define EXT4_MOUNT_CHECK 0x00001 /* Do mount-time checks */ #define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 6a0b40d43264..445fde603df8 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -99,9 +99,6 @@ struct ext4_sb_info { struct inode *s_buddy_cache; long s_blocks_reserved; spinlock_t s_reserve_lock; - struct list_head s_active_transaction; - struct list_head s_closed_transaction; - struct list_head s_committed_transaction; spinlock_t s_md_lock; tid_t s_last_transaction; unsigned short *s_mb_offsets, *s_mb_maxs; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9b4ec9decfd1..8dbf6953845b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) int ret = 0, err, nr_pages, i; unsigned long index, end; struct pagevec pvec; + long pages_skipped; BUG_ON(mpd->next_page <= mpd->first_page); pagevec_init(&pvec, 0); @@ -1655,20 +1656,30 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) end = mpd->next_page - 1; while (index <= end) { - /* XXX: optimize tail */ - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + /* + * We can use PAGECACHE_TAG_DIRTY lookup here because + * even though we have cleared the dirty flag on the page + * We still keep the page in the radix tree with tag + * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io. + * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback + * which is called via the below writepage callback. 
+ */ + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, + (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - index = page->index; - if (index > end) - break; - index++; - + pages_skipped = mpd->wbc->pages_skipped; err = mapping->a_ops->writepage(page, mpd->wbc); - if (!err) + if (!err && (pages_skipped == mpd->wbc->pages_skipped)) + /* + * have successfully written the page + * without skipping the same + */ mpd->pages_written++; /* * In error case, we have to continue because @@ -2104,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping, struct writeback_control *wbc, struct mpage_da_data *mpd) { - long to_write; int ret; if (!mpd->get_block) @@ -2119,19 +2129,18 @@ static int mpage_da_writepages(struct address_space *mapping, mpd->pages_written = 0; mpd->retval = 0; - to_write = wbc->nr_to_write; - ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); - /* * Handle last extent of pages */ if (!mpd->io_done && mpd->next_page != mpd->first_page) { if (mpage_da_map_blocks(mpd) == 0) mpage_da_submit_io(mpd); - } - wbc->nr_to_write = to_write - mpd->pages_written; + mpd->io_done = 1; + ret = MPAGE_DA_EXTENT_TAIL; + } + wbc->nr_to_write -= mpd->pages_written; return ret; } @@ -2360,12 +2369,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) static int ext4_da_writepages(struct address_space *mapping, struct writeback_control *wbc) { + pgoff_t index; + int range_whole = 0; handle_t *handle = NULL; - loff_t range_start = 0; struct mpage_da_data mpd; struct inode *inode = mapping->host; + int no_nrwrite_index_update; + long pages_written = 0, pages_skipped; int needed_blocks, ret = 0, nr_to_writebump = 0; - long to_write, pages_skipped = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); /* @@ -2385,23 +2396,26 @@ static int ext4_da_writepages(struct address_space *mapping, nr_to_writebump = 
sbi->s_mb_stream_request - wbc->nr_to_write; wbc->nr_to_write = sbi->s_mb_stream_request; } + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; - if (!wbc->range_cyclic) - /* - * If range_cyclic is not set force range_cont - * and save the old writeback_index - */ - wbc->range_cont = 1; - - range_start = wbc->range_start; - pages_skipped = wbc->pages_skipped; + if (wbc->range_cyclic) + index = mapping->writeback_index; + else + index = wbc->range_start >> PAGE_CACHE_SHIFT; mpd.wbc = wbc; mpd.inode = mapping->host; -restart_loop: - to_write = wbc->nr_to_write; - while (!ret && to_write > 0) { + /* + * we don't want write_cache_pages to update + * nr_to_write and writeback_index + */ + no_nrwrite_index_update = wbc->no_nrwrite_index_update; + wbc->no_nrwrite_index_update = 1; + pages_skipped = wbc->pages_skipped; + + while (!ret && wbc->nr_to_write > 0) { /* * we insert one extent at a time. So we need @@ -2422,48 +2436,53 @@ restart_loop: dump_stack(); goto out_writepages; } - to_write -= wbc->nr_to_write; - mpd.get_block = ext4_da_get_block_write; ret = mpage_da_writepages(mapping, wbc, &mpd); ext4_journal_stop(handle); - if (mpd.retval == -ENOSPC) + if (mpd.retval == -ENOSPC) { + /* commit the transaction which would + * free blocks released in the transaction + * and try again + */ jbd2_journal_force_commit_nested(sbi->s_journal); - - /* reset the retry count */ - if (ret == MPAGE_DA_EXTENT_TAIL) { + wbc->pages_skipped = pages_skipped; + ret = 0; + } else if (ret == MPAGE_DA_EXTENT_TAIL) { /* * got one extent now try with * rest of the pages */ - to_write += wbc->nr_to_write; + pages_written += mpd.pages_written; + wbc->pages_skipped = pages_skipped; ret = 0; - } else if (wbc->nr_to_write) { + } else if (wbc->nr_to_write) /* * There is no more writeout needed * or we requested for a noblocking writeout * and we found the device congested */ - to_write += wbc->nr_to_write; break; - } - wbc->nr_to_write = to_write; - } - - if 
(wbc->range_cont && (pages_skipped != wbc->pages_skipped)) { - /* We skipped pages in this loop */ - wbc->range_start = range_start; - wbc->nr_to_write = to_write + - wbc->pages_skipped - pages_skipped; - wbc->pages_skipped = pages_skipped; - goto restart_loop; } + if (pages_skipped != wbc->pages_skipped) + printk(KERN_EMERG "This should not happen leaving %s " + "with nr_to_write = %ld ret = %d\n", + __func__, wbc->nr_to_write, ret); + + /* Update index */ + index += pages_written; + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + /* + * set the writeback_index so that range_cyclic + * mode will write it back later + */ + mapping->writeback_index = index; out_writepages: - wbc->nr_to_write = to_write - nr_to_writebump; - wbc->range_start = range_start; + if (!no_nrwrite_index_update) + wbc->no_nrwrite_index_update = 0; + wbc->nr_to_write -= nr_to_writebump; return ret; } @@ -4175,7 +4194,6 @@ static int ext4_inode_blocks_set(handle_t *handle, struct inode *inode = &(ei->vfs_inode); u64 i_blocks = inode->i_blocks; struct super_block *sb = inode->i_sb; - int err = 0; if (i_blocks <= ~0U) { /* @@ -4185,36 +4203,27 @@ static int ext4_inode_blocks_set(handle_t *handle, raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = 0; ei->i_flags &= ~EXT4_HUGE_FILE_FL; - } else if (i_blocks <= 0xffffffffffffULL) { + return 0; + } + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) + return -EFBIG; + + if (i_blocks <= 0xffffffffffffULL) { /* * i_blocks can be represented in a 48 bit variable * as multiple of 512 bytes */ - err = ext4_update_rocompat_feature(handle, sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); - if (err) - goto err_out; - /* i_block is stored in the split 48 bit fields */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); ei->i_flags &= ~EXT4_HUGE_FILE_FL; } else { - /* - * i_blocks should be represented in a 48 bit variable - * as multiple of file system 
block size - */ - err = ext4_update_rocompat_feature(handle, sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); - if (err) - goto err_out; ei->i_flags |= EXT4_HUGE_FILE_FL; /* i_block is stored in file system block size */ i_blocks = i_blocks >> (inode->i_blkbits - 9); raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); } -err_out: - return err; + return 0; } /* diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b580714f0d85..dfe17a134052 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2300,6 +2300,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, } INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + meta_group_info[i]->bb_free_root.rb_node = NULL;; #ifdef DOUBLE_CHECK { @@ -2522,9 +2523,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) } spin_lock_init(&sbi->s_md_lock); - INIT_LIST_HEAD(&sbi->s_active_transaction); - INIT_LIST_HEAD(&sbi->s_closed_transaction); - INIT_LIST_HEAD(&sbi->s_committed_transaction); spin_lock_init(&sbi->s_bal_lock); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; @@ -2553,6 +2551,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) ext4_mb_init_per_dev_proc(sb); ext4_mb_history_init(sb); + sbi->s_journal->j_commit_callback = release_blocks_on_commit; + printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); return 0; } @@ -2568,7 +2568,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); list_del(&pa->pa_group_list); count++; - kfree(pa); + kmem_cache_free(ext4_pspace_cachep, pa); } if (count) mb_debug("mballoc: %u PAs left\n", count); @@ -2582,15 +2582,6 @@ int ext4_mb_release(struct super_block *sb) struct ext4_group_info *grinfo; struct ext4_sb_info *sbi = EXT4_SB(sb); - /* release freed, non-committed blocks */ - spin_lock(&sbi->s_md_lock); - list_splice_init(&sbi->s_closed_transaction, - &sbi->s_committed_transaction); - 
list_splice_init(&sbi->s_active_transaction, - &sbi->s_committed_transaction); - spin_unlock(&sbi->s_md_lock); - ext4_mb_free_committed_blocks(sb); - if (sbi->s_group_info) { for (i = 0; i < sbi->s_groups_count; i++) { grinfo = ext4_get_group_info(sb, i); @@ -2644,61 +2635,57 @@ int ext4_mb_release(struct super_block *sb) return 0; } -static noinline_for_stack void -ext4_mb_free_committed_blocks(struct super_block *sb) +/* + * This function is called by the jbd2 layer once the commit has finished, + * so we know we can free the blocks that were released with that commit. + */ +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) { - struct ext4_sb_info *sbi = EXT4_SB(sb); - int err; - int i; - int count = 0; - int count2 = 0; - struct ext4_free_metadata *md; + struct super_block *sb = journal->j_private; struct ext4_buddy e4b; + struct ext4_group_info *db; + int err, count = 0, count2 = 0; + struct ext4_free_data *entry; + ext4_fsblk_t discard_block; + struct list_head *l, *ltmp; - if (list_empty(&sbi->s_committed_transaction)) - return; - - /* there is committed blocks to be freed yet */ - do { - /* get next array of blocks */ - md = NULL; - spin_lock(&sbi->s_md_lock); - if (!list_empty(&sbi->s_committed_transaction)) { - md = list_entry(sbi->s_committed_transaction.next, - struct ext4_free_metadata, list); - list_del(&md->list); - } - spin_unlock(&sbi->s_md_lock); - - if (md == NULL) - break; + list_for_each_safe(l, ltmp, &txn->t_private_list) { + entry = list_entry(l, struct ext4_free_data, list); mb_debug("gonna free %u blocks in group %lu (0x%p):", - md->num, md->group, md); + entry->count, entry->group, entry); - err = ext4_mb_load_buddy(sb, md->group, &e4b); + err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); + db = e4b.bd_info; /* there are blocks to put in buddy to make them really free */ - count += md->num; + count += entry->count; count2++; - 
ext4_lock_group(sb, md->group); - for (i = 0; i < md->num; i++) { - mb_debug(" %u", md->blocks[i]); - mb_free_blocks(NULL, &e4b, md->blocks[i], 1); + ext4_lock_group(sb, entry->group); + /* Take it out of per group rb tree */ + rb_erase(&entry->node, &(db->bb_free_root)); + mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); + + if (!db->bb_free_root.rb_node) { + /* No more items in the per group rb tree + * balance refcounts from ext4_mb_free_metadata() + */ + page_cache_release(e4b.bd_buddy_page); + page_cache_release(e4b.bd_bitmap_page); } - mb_debug("\n"); - ext4_unlock_group(sb, md->group); - - /* balance refcounts from ext4_mb_free_metadata() */ - page_cache_release(e4b.bd_buddy_page); - page_cache_release(e4b.bd_bitmap_page); - - kfree(md); + ext4_unlock_group(sb, entry->group); + discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) + + entry->start_blk + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id, + (unsigned long long) discard_block, entry->count); + sb_issue_discard(sb, discard_block, entry->count); + + kmem_cache_free(ext4_free_ext_cachep, entry); ext4_mb_release_desc(&e4b); - - } while (md); + } mb_debug("freed %u blocks in %u structures\n", count, count2); } @@ -2712,6 +2699,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb) static int ext4_mb_init_per_dev_proc(struct super_block *sb) { +#ifdef CONFIG_PROC_FS mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; struct ext4_sb_info *sbi = EXT4_SB(sb); struct proc_dir_entry *proc; @@ -2735,10 +2723,14 @@ err_out: remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); return -ENOMEM; +#else + return 0; +#endif } static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) { +#ifdef CONFIG_PROC_FS struct ext4_sb_info *sbi = EXT4_SB(sb); if (sbi->s_proc == NULL) @@ -2750,7 +2742,7 @@ static int ext4_mb_destroy_per_dev_proc(struct super_block 
*sb) remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); - +#endif return 0; } @@ -2771,6 +2763,16 @@ int __init init_ext4_mballoc(void) kmem_cache_destroy(ext4_pspace_cachep); return -ENOMEM; } + + ext4_free_ext_cachep = + kmem_cache_create("ext4_free_block_extents", + sizeof(struct ext4_free_data), + 0, SLAB_RECLAIM_ACCOUNT, NULL); + if (ext4_free_ext_cachep == NULL) { + kmem_cache_destroy(ext4_pspace_cachep); + kmem_cache_destroy(ext4_ac_cachep); + return -ENOMEM; + } return 0; } @@ -2779,6 +2781,7 @@ void exit_ext4_mballoc(void) /* XXX: synchronize_rcu(); */ kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); + kmem_cache_destroy(ext4_free_ext_cachep); } @@ -4324,8 +4327,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, goto out1; } - ext4_mb_poll_new_transaction(sb, handle); - *errp = ext4_mb_initialize_context(ac, ar); if (*errp) { ar->len = 0; @@ -4384,35 +4385,20 @@ out1: return block; } -static void ext4_mb_poll_new_transaction(struct super_block *sb, - handle_t *handle) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (sbi->s_last_transaction == handle->h_transaction->t_tid) - return; - - /* new transaction! time to close last one and free blocks for - * committed transaction. we know that only transaction can be - * active, so previos transaction can be being logged and we - * know that transaction before previous is known to be already - * logged. this means that now we may free blocks freed in all - * transactions before previous one. hope I'm clear enough ... 
*/ - spin_lock(&sbi->s_md_lock); - if (sbi->s_last_transaction != handle->h_transaction->t_tid) { - mb_debug("new transaction %lu, old %lu\n", - (unsigned long) handle->h_transaction->t_tid, - (unsigned long) sbi->s_last_transaction); - list_splice_init(&sbi->s_closed_transaction, - &sbi->s_committed_transaction); - list_splice_init(&sbi->s_active_transaction, - &sbi->s_closed_transaction); - sbi->s_last_transaction = handle->h_transaction->t_tid; - } - spin_unlock(&sbi->s_md_lock); - - ext4_mb_free_committed_blocks(sb); +/* + * We can merge two free data extents only if the physical blocks + * are contiguous, AND the extents were freed by the same transaction, + * AND the blocks are associated with the same group. + */ +static int can_merge(struct ext4_free_data *entry1, + struct ext4_free_data *entry2) +{ + if ((entry1->t_tid == entry2->t_tid) && + (entry1->group == entry2->group) && + ((entry1->start_blk + entry1->count) == entry2->start_blk)) + return 1; + return 0; } static noinline_for_stack int @@ -4422,57 +4408,80 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_free_metadata *md; - int i; + struct ext4_free_data *entry, *new_entry; + struct rb_node **n = &db->bb_free_root.rb_node, *node; + struct rb_node *parent = NULL, *new_node; + BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + new_entry->start_blk = block; + new_entry->group = group; + new_entry->count = count; + new_entry->t_tid = handle->h_transaction->t_tid; + new_node = &new_entry->node; + ext4_lock_group(sb, group); - for (i = 0; i < count; i++) { - md = db->bb_md_cur; - if (md && db->bb_tid != handle->h_transaction->t_tid) { - db->bb_md_cur = NULL; - md = NULL; + if (!*n) { + /* first free block exent. 
We need to + protect buddy cache from being freed, + * otherwise we'll refresh it from + * on-disk bitmap and lose not-yet-available + * blocks */ + page_cache_get(e4b->bd_buddy_page); + page_cache_get(e4b->bd_bitmap_page); + } + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_free_data, node); + if (block < entry->start_blk) + n = &(*n)->rb_left; + else if (block >= (entry->start_blk + entry->count)) + n = &(*n)->rb_right; + else { + ext4_error(sb, __func__, + "Double free of blocks %d (%d %d)\n", + block, entry->start_blk, entry->count); + return 0; } + } - if (md == NULL) { - ext4_unlock_group(sb, group); - md = kmalloc(sizeof(*md), GFP_NOFS); - if (md == NULL) - return -ENOMEM; - md->num = 0; - md->group = group; - - ext4_lock_group(sb, group); - if (db->bb_md_cur == NULL) { - spin_lock(&sbi->s_md_lock); - list_add(&md->list, &sbi->s_active_transaction); - spin_unlock(&sbi->s_md_lock); - /* protect buddy cache from being freed, - * otherwise we'll refresh it from - * on-disk bitmap and lose not-yet-available - * blocks */ - page_cache_get(e4b->bd_buddy_page); - page_cache_get(e4b->bd_bitmap_page); - db->bb_md_cur = md; - db->bb_tid = handle->h_transaction->t_tid; - mb_debug("new md 0x%p for group %lu\n", - md, md->group); - } else { - kfree(md); - md = db->bb_md_cur; - } + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &db->bb_free_root); + + /* Now try to see the extent can be merged to left and right */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, node); + if (can_merge(entry, new_entry)) { + new_entry->start_blk = entry->start_blk; + new_entry->count += entry->count; + rb_erase(node, &(db->bb_free_root)); + spin_lock(&sbi->s_md_lock); + list_del(&entry->list); + spin_unlock(&sbi->s_md_lock); + kmem_cache_free(ext4_free_ext_cachep, entry); } + } - BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS); - md->blocks[md->num] = block + i; - md->num++; - if (md->num == EXT4_BB_MAX_BLOCKS) { - /* no 
more space, put full container on a sb's list */ - db->bb_md_cur = NULL; + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, node); + if (can_merge(new_entry, entry)) { + new_entry->count += entry->count; + rb_erase(node, &(db->bb_free_root)); + spin_lock(&sbi->s_md_lock); + list_del(&entry->list); + spin_unlock(&sbi->s_md_lock); + kmem_cache_free(ext4_free_ext_cachep, entry); } } + /* Add the extent to transaction's private list */ + spin_lock(&sbi->s_md_lock); + list_add(&new_entry->list, &handle->h_transaction->t_private_list); + spin_unlock(&sbi->s_md_lock); ext4_unlock_group(sb, group); return 0; } @@ -4500,8 +4509,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, *freed = 0; - ext4_mb_poll_new_transaction(sb, handle); - sbi = EXT4_SB(sb); es = EXT4_SB(sb)->s_es; if (block < le32_to_cpu(es->s_first_data_block) || diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index b3b4828f8b89..b5dff1fff1e5 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -18,6 +18,8 @@ #include <linux/pagemap.h> #include <linux/seq_file.h> #include <linux/version.h> +#include <linux/blkdev.h> +#include <linux/marker.h> #include "ext4_jbd2.h" #include "ext4.h" #include "group.h" @@ -98,23 +100,29 @@ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; +static struct kmem_cache *ext4_free_ext_cachep; -#ifdef EXT4_BB_MAX_BLOCKS -#undef EXT4_BB_MAX_BLOCKS -#endif -#define EXT4_BB_MAX_BLOCKS 30 +struct ext4_free_data { + /* this links the free block information from group_info */ + struct rb_node node; -struct ext4_free_metadata { - ext4_group_t group; - unsigned short num; - ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS]; + /* this links the free block information from ext4_sb_info */ struct list_head list; + + /* group which free block extent belongs */ + ext4_group_t group; + + /* free block extent */ + ext4_grpblk_t start_blk; + ext4_grpblk_t count; + + /* transaction which freed this extent */ 
+ tid_t t_tid; }; struct ext4_group_info { unsigned long bb_state; - unsigned long bb_tid; - struct ext4_free_metadata *bb_md_cur; + struct rb_root bb_free_root; unsigned short bb_first_free; unsigned short bb_free; unsigned short bb_fragments; @@ -261,8 +269,6 @@ struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); -static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *); -static void ext4_mb_free_committed_blocks(struct super_block *); static void ext4_mb_return_to_preallocation(struct inode *inode, struct ext4_buddy *e4b, sector_t block, int count); @@ -270,6 +276,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *, struct super_block *, struct ext4_prealloc_space *pa); static int ext4_mb_init_per_dev_proc(struct super_block *sb); static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dea8f13c2fd9..9b2b2bc4ec17 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -374,66 +374,6 @@ void ext4_update_dynamic_rev(struct super_block *sb) */ } -int ext4_update_compat_feature(handle_t *handle, - struct super_block *sb, __u32 compat) -{ - int err = 0; - if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) { - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - return err; - EXT4_SET_COMPAT_FEATURE(sb, compat); - sb->s_dirt = 1; - handle->h_sync = 1; - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, - "call ext4_journal_dirty_met adata"); - err = ext4_journal_dirty_metadata(handle, - EXT4_SB(sb)->s_sbh); - } - return err; -} - -int ext4_update_rocompat_feature(handle_t *handle, - struct super_block *sb, __u32 rocompat) -{ - int err = 0; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) { - err = 
ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - return err; - EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat); - sb->s_dirt = 1; - handle->h_sync = 1; - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, - "call ext4_journal_dirty_met adata"); - err = ext4_journal_dirty_metadata(handle, - EXT4_SB(sb)->s_sbh); - } - return err; -} - -int ext4_update_incompat_feature(handle_t *handle, - struct super_block *sb, __u32 incompat) -{ - int err = 0; - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) { - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - return err; - EXT4_SET_INCOMPAT_FEATURE(sb, incompat); - sb->s_dirt = 1; - handle->h_sync = 1; - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, - "call ext4_journal_dirty_met adata"); - err = ext4_journal_dirty_metadata(handle, - EXT4_SB(sb)->s_sbh); - } - return err; -} - /* * Open the external journal device */ @@ -904,7 +844,7 @@ static const struct export_operations ext4_export_ops = { enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, - Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, @@ -915,7 +855,7 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, - Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_inode_readahead_blks }; @@ -933,8 +873,6 @@ static const match_table_t tokens = { {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, {Opt_nouid32, "nouid32"}, - {Opt_nocheck, "nocheck"}, - {Opt_nocheck, "check=none"}, {Opt_debug, "debug"}, {Opt_oldalloc, "oldalloc"}, 
{Opt_orlov, "orlov"}, @@ -973,8 +911,6 @@ static const match_table_t tokens = { {Opt_extents, "extents"}, {Opt_noextents, "noextents"}, {Opt_i_version, "i_version"}, - {Opt_mballoc, "mballoc"}, - {Opt_nomballoc, "nomballoc"}, {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, {Opt_delalloc, "delalloc"}, @@ -1073,9 +1009,6 @@ static int parse_options(char *options, struct super_block *sb, case Opt_nouid32: set_opt(sbi->s_mount_opt, NO_UID32); break; - case Opt_nocheck: - clear_opt(sbi->s_mount_opt, CHECK); - break; case Opt_debug: set_opt(sbi->s_mount_opt, DEBUG); break; @@ -1618,14 +1551,14 @@ static int ext4_check_descriptors(struct super_block *sb) if (block_bitmap < first_block || block_bitmap > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Block bitmap for group %lu not in group " - "(block %llu)!", i, block_bitmap); + "(block %llu)!\n", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); if (inode_bitmap < first_block || inode_bitmap > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Inode bitmap for group %lu not in group " - "(block %llu)!", i, inode_bitmap); + "(block %llu)!\n", i, inode_bitmap); return 0; } inode_table = ext4_inode_table(sb, gdp); @@ -1633,7 +1566,7 @@ static int ext4_check_descriptors(struct super_block *sb) inode_table + sbi->s_itb_per_group - 1 > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Inode table for group %lu not in group " - "(block %llu)!", i, inode_table); + "(block %llu)!\n", i, inode_table); return 0; } spin_lock(sb_bgl_lock(sbi, i)); @@ -1778,13 +1711,13 @@ static void ext4_orphan_cleanup(struct super_block *sb, * * Note, this does *not* consider any metadata overhead for vfs i_blocks. */ -static loff_t ext4_max_size(int blkbits) +static loff_t ext4_max_size(int blkbits, int has_huge_files) { loff_t res; loff_t upper_limit = MAX_LFS_FILESIZE; /* small i_blocks in vfs inode? 
*/ - if (sizeof(blkcnt_t) < sizeof(u64)) { + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { /* * CONFIG_LSF is not enabled implies the inode * i_block represent total blocks in 512 bytes @@ -1814,7 +1747,7 @@ static loff_t ext4_max_size(int blkbits) * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. * We need to be 1 filesystem block less than the 2^48 sector limit. */ -static loff_t ext4_max_bitmap_size(int bits) +static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) { loff_t res = EXT4_NDIR_BLOCKS; int meta_blocks; @@ -1827,11 +1760,11 @@ static loff_t ext4_max_bitmap_size(int bits) * total number of 512 bytes blocks of the file */ - if (sizeof(blkcnt_t) < sizeof(u64)) { + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { /* - * CONFIG_LSF is not enabled implies the inode - * i_block represent total blocks in 512 bytes - * 32 == size of vfs inode i_blocks * 8 + * !has_huge_files or CONFIG_LSF is not enabled + * implies the inode i_block represent total blocks in + * 512 bytes 32 == size of vfs inode i_blocks * 8 */ upper_limit = (1LL << 32) - 1; @@ -1940,7 +1873,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) int blocksize; int db_count; int i; - int needs_recovery; + int needs_recovery, has_huge_files; __le32 features; __u64 blocks_count; int err; @@ -2081,7 +2014,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_id, le32_to_cpu(features)); goto failed_mount; } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { + has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_HUGE_FILE); + if (has_huge_files) { /* * Large file size enabled file system can only be * mount if kernel is build with CONFIG_LSF @@ -2131,8 +2066,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } - sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits); - sb->s_maxbytes = 
ext4_max_size(sb->s_blocksize_bits); + sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, + has_huge_files); + sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; @@ -2456,6 +2392,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "available.\n"); } + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " + "requested data journaling mode\n"); + clear_opt(sbi->s_mount_opt, DELALLOC); + } else if (test_opt(sb, DELALLOC)) + printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); + + ext4_ext_init(sb); + err = ext4_mb_init(sb, needs_recovery); + if (err) { + printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n", + err); + goto failed_mount4; + } + /* * akpm: core read_super() calls in here with the superblock locked. * That deadlocks, because orphan cleanup needs to lock the superblock @@ -2475,21 +2426,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": "writeback"); - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " - "requested data journaling mode\n"); - clear_opt(sbi->s_mount_opt, DELALLOC); - } else if (test_opt(sb, DELALLOC)) - printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); - - ext4_ext_init(sb); - err = ext4_mb_init(sb, needs_recovery); - if (err) { - printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n", - err); - goto failed_mount4; - } - lock_kernel(); return 0; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 25adfc3c693a..d0ff0b8cf309 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -8,7 +8,7 @@ * pages against inodes. ie: data writeback. Writeout of the * inode itself is not handled here. 
* - * 10Apr2002 akpm@zip.com.au + * 10Apr2002 Andrew Morton * Split out of fs/inode.c * Additions for address_space-based writeback */ diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index ba851576ebb1..6d98f116ca03 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -190,6 +190,10 @@ int hfs_cat_find_brec(struct super_block *sb, u32 cnid, fd->search_key->cat.ParID = rec.thread.ParID; len = fd->search_key->cat.CName.len = rec.thread.CName.len; + if (len > HFS_NAMELEN) { + printk(KERN_ERR "hfs: bad catalog namelength\n"); + return -EIO; + } memcpy(fd->search_key->cat.CName.name, rec.thread.CName.name, len); return hfs_brec_find(fd); } diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index d128a25b74d2..ea30afc2a03c 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -32,6 +32,10 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex); mapping = HFSPLUS_SB(sb).alloc_file->i_mapping; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); + if (IS_ERR(page)) { + start = size; + goto out; + } pptr = kmap(page); curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; i = offset % 32; @@ -73,6 +77,10 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma break; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); + if (IS_ERR(page)) { + start = size; + goto out; + } curr = pptr = kmap(page); if ((size ^ offset) / PAGE_CACHE_BITS) end = pptr + PAGE_CACHE_BITS / 32; @@ -120,6 +128,10 @@ found: offset += PAGE_CACHE_BITS; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); + if (IS_ERR(page)) { + start = size; + goto out; + } pptr = kmap(page); curr = pptr; end = pptr + PAGE_CACHE_BITS / 32; diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index ba117c445e78..f6874acb2cf2 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -168,6 +168,11 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid, 
return -EIO; } + if (be16_to_cpu(tmp.thread.nodeName.length) > 255) { + printk(KERN_ERR "hfs: catalog name length corrupted\n"); + return -EIO; + } + hfsplus_cat_build_key_uni(fd->search_key, be32_to_cpu(tmp.thread.parentID), &tmp.thread.nodeName); return hfs_brec_find(fd); diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index fec8f61227ff..0022eec63cda 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -199,6 +199,9 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, goto done; } + if (inode->i_ino == HFSPLUS_EXT_CNID) + return -EIO; + mutex_lock(&HFSPLUS_I(inode).extents_lock); res = hfsplus_ext_read_extent(inode, ablock); if (!res) { diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index b085d64a2b67..963be644297a 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -254,6 +254,8 @@ static int hfsplus_file_open(struct inode *inode, struct file *file) { if (HFSPLUS_IS_RSRC(inode)) inode = HFSPLUS_I(inode).rsrc_inode; + if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) + return -EOVERFLOW; atomic_inc(&HFSPLUS_I(inode).opencnt); return 0; } diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index e834e578c93f..eb74531a0a8e 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -356,7 +356,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; - } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { + } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && !(sb->s_flags & MS_RDONLY)) { printk(KERN_WARNING "hfs: write access to a journaled filesystem is not supported, " "use the force option at your own risk, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig new file mode 100644 index 000000000000..4e28beeed157 --- /dev/null 
+++ b/fs/jbd/Kconfig @@ -0,0 +1,30 @@ +config JBD + tristate + help + This is a generic journalling layer for block devices. It is + currently used by the ext3 file system, but it could also be + used to add journal support to other file systems or block + devices such as RAID or LVM. + + If you are using the ext3 file system, you need to say Y here. + If you are not using ext3 then you will probably want to say N. + + To compile this device as a module, choose M here: the module will be + called jbd. If you are compiling ext3 into the kernel, you + cannot compile this code as a module. + +config JBD_DEBUG + bool "JBD (ext3) debugging support" + depends on JBD && DEBUG_FS + help + If you are using the ext3 journaled file system (or potentially any + other file system/device using JBD), this option allows you to + enable debugging output while the system is running, in order to + help track down any problems you are having. By default the + debugging output will be turned off. + + If you select Y here, then you will be able to turn on debugging + with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a + number between 1 and 5, the higher the number, the more debugging + output is generated. To turn debugging off again, do + "echo 0 > /sys/kernel/debug/jbd/jbd-debug". diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index ae08c057e751..25719d902c51 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -482,6 +482,8 @@ void journal_commit_transaction(journal_t *journal) printk(KERN_WARNING "JBD: Detected IO errors while flushing file data " "on %s\n", bdevname(journal->j_fs_dev, b)); + if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR) + journal_abort(journal, err); err = 0; } @@ -518,9 +520,10 @@ void journal_commit_transaction(journal_t *journal) jh = commit_transaction->t_buffers; /* If we're in abort mode, we just un-journal the buffer and - release it for background writing. */ + release it. 
*/ if (is_journal_aborted(journal)) { + clear_buffer_jbddirty(jh2bh(jh)); JBUFFER_TRACE(jh, "journal is aborting: refile"); journal_refile_buffer(journal, jh); /* If that was the last one, we need to clean up @@ -762,6 +765,9 @@ wait_for_iobuf: /* AKPM: bforget here */ } + if (err) + journal_abort(journal, err); + jbd_debug(3, "JBD: commit phase 6\n"); if (journal_write_commit_record(journal, commit_transaction)) @@ -852,6 +858,8 @@ restart_loop: if (buffer_jbddirty(bh)) { JBUFFER_TRACE(jh, "add to new checkpointing trans"); __journal_insert_checkpoint(jh, commit_transaction); + if (is_journal_aborted(journal)) + clear_buffer_jbddirty(bh); JBUFFER_TRACE(jh, "refile for checkpoint writeback"); __journal_refile_buffer(jh); jbd_unlock_bh_state(bh); diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 0540ca27a446..d15cd6e7251e 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -954,9 +954,10 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) journal_t *journal = handle->h_transaction->t_journal; int need_brelse = 0; struct journal_head *jh; + int ret = 0; if (is_handle_aborted(handle)) - return 0; + return ret; jh = journal_add_journal_head(bh); JBUFFER_TRACE(jh, "entry"); @@ -1067,7 +1068,16 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) time if it is redirtied */ } - /* journal_clean_data_list() may have got there first */ + /* + * We cannot remove the buffer with io error from the + * committing transaction, because otherwise it would + * miss the error and the commit would not abort. 
+ */ + if (unlikely(!buffer_uptodate(bh))) { + ret = -EIO; + goto no_journal; + } + if (jh->b_transaction != NULL) { JBUFFER_TRACE(jh, "unfile from commit"); __journal_temp_unlink_buffer(jh); @@ -1108,7 +1118,7 @@ no_journal: } JBUFFER_TRACE(jh, "exit"); journal_put_journal_head(jh); - return 0; + return ret; } /** diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig new file mode 100644 index 000000000000..f32f346f4b0a --- /dev/null +++ b/fs/jbd2/Kconfig @@ -0,0 +1,33 @@ +config JBD2 + tristate + select CRC32 + help + This is a generic journaling layer for block devices that support + both 32-bit and 64-bit block numbers. It is currently used by + the ext4 and OCFS2 filesystems, but it could also be used to add + journal support to other file systems or block devices such + as RAID or LVM. + + If you are using ext4 or OCFS2, you need to say Y here. + If you are not using ext4 or OCFS2 then you will + probably want to say N. + + To compile this device as a module, choose M here. The module will be + called jbd2. If you are compiling ext4 or OCFS2 into the kernel, + you cannot compile this code as a module. + +config JBD2_DEBUG + bool "JBD2 (ext4) debugging support" + depends on JBD2 && DEBUG_FS + help + If you are using the ext4 journaled file system (or + potentially any other filesystem/device using JBD2), this option + allows you to enable debugging output while the system is running, + in order to help track down any problems you are having. + By default, the debugging output will be turned off. + + If you select Y here, then you will be able to turn on debugging + with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a + number between 1 and 5. The higher the number, the more debugging + output is generated. To turn debugging off again, do + "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug". 
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 0abe02c4242a..8b119e16aa36 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -995,6 +995,9 @@ restart_loop: } spin_unlock(&journal->j_list_lock); + if (journal->j_commit_callback) + journal->j_commit_callback(journal, commit_transaction); + trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", journal->j_devname, commit_transaction->t_tid, journal->j_tail_sequence); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e5d540588fa9..39b7805a599a 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -52,6 +52,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); INIT_LIST_HEAD(&transaction->t_inode_list); + INIT_LIST_HEAD(&transaction->t_private_list); /* Set up the commit timer for the new transaction. */ journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); diff --git a/fs/jffs2/Kconfig b/fs/jffs2/Kconfig new file mode 100644 index 000000000000..6ae169cd8faa --- /dev/null +++ b/fs/jffs2/Kconfig @@ -0,0 +1,188 @@ +config JFFS2_FS + tristate "Journalling Flash File System v2 (JFFS2) support" + select CRC32 + depends on MTD + help + JFFS2 is the second generation of the Journalling Flash File System + for use on diskless embedded devices. It provides improved wear + levelling, compression and support for hard links. You cannot use + this on normal block devices, only on 'MTD' devices. + + Further information on the design and implementation of JFFS2 is + available at <http://sources.redhat.com/jffs2/>. + +config JFFS2_FS_DEBUG + int "JFFS2 debugging verbosity (0 = quiet, 2 = noisy)" + depends on JFFS2_FS + default "0" + help + This controls the amount of debugging messages produced by the JFFS2 + code. Set it to zero for use in production systems. For evaluation, + testing and debugging, it's advisable to set it to one. 
This will + enable a few assertions and will print debugging messages at the + KERN_DEBUG loglevel, where they won't normally be visible. Level 2 + is unlikely to be useful - it enables extra debugging in certain + areas which at one point needed debugging, but when the bugs were + located and fixed, the detailed messages were relegated to level 2. + + If reporting bugs, please try to have available a full dump of the + messages at debug level 1 while the misbehaviour was occurring. + +config JFFS2_FS_WRITEBUFFER + bool "JFFS2 write-buffering support" + depends on JFFS2_FS + default y + help + This enables the write-buffering support in JFFS2. + + This functionality is required to support JFFS2 on the following + types of flash devices: + - NAND flash + - NOR flash with transparent ECC + - DataFlash + +config JFFS2_FS_WBUF_VERIFY + bool "Verify JFFS2 write-buffer reads" + depends on JFFS2_FS_WRITEBUFFER + default n + help + This causes JFFS2 to read back every page written through the + write-buffer, and check for errors. + +config JFFS2_SUMMARY + bool "JFFS2 summary support (EXPERIMENTAL)" + depends on JFFS2_FS && EXPERIMENTAL + default n + help + This feature makes it possible to use summary information + for faster filesystem mount. + + The summary information can be inserted into a filesystem image + by the utility 'sumtool'. + + If unsure, say 'N'. + +config JFFS2_FS_XATTR + bool "JFFS2 XATTR support (EXPERIMENTAL)" + depends on JFFS2_FS && EXPERIMENTAL + default n + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + If unsure, say N. + +config JFFS2_FS_POSIX_ACL + bool "JFFS2 POSIX Access Control Lists" + depends on JFFS2_FS_XATTR + default y + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. 
+ + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N + +config JFFS2_FS_SECURITY + bool "JFFS2 Security Labels" + depends on JFFS2_FS_XATTR + default y + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the jffs2 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + +config JFFS2_COMPRESSION_OPTIONS + bool "Advanced compression options for JFFS2" + depends on JFFS2_FS + default n + help + Enabling this option allows you to explicitly choose which + compression modules, if any, are enabled in JFFS2. Removing + compressors can mean you cannot read existing file systems, + and enabling experimental compressors can mean that you + write a file system which cannot be read by a standard kernel. + + If unsure, you should _definitely_ say 'N'. + +config JFFS2_ZLIB + bool "JFFS2 ZLIB compression support" if JFFS2_COMPRESSION_OPTIONS + select ZLIB_INFLATE + select ZLIB_DEFLATE + depends on JFFS2_FS + default y + help + Zlib is designed to be a free, general-purpose, legally unencumbered, + lossless data-compression library for use on virtually any computer + hardware and operating system. See <http://www.gzip.org/zlib/> for + further information. + + Say 'Y' if unsure. + +config JFFS2_LZO + bool "JFFS2 LZO compression support" if JFFS2_COMPRESSION_OPTIONS + select LZO_COMPRESS + select LZO_DECOMPRESS + depends on JFFS2_FS + default n + help + minilzo-based compression. Generally works better than Zlib. + + This feature was added in July, 2007. Say 'N' if you need + compatibility with older bootloaders or kernels. 
+ +config JFFS2_RTIME + bool "JFFS2 RTIME compression support" if JFFS2_COMPRESSION_OPTIONS + depends on JFFS2_FS + default y + help + Rtime does manage to recompress already-compressed data. Say 'Y' if unsure. + +config JFFS2_RUBIN + bool "JFFS2 RUBIN compression support" if JFFS2_COMPRESSION_OPTIONS + depends on JFFS2_FS + default n + help + RUBINMIPS and DYNRUBIN compressors. Say 'N' if unsure. + +choice + prompt "JFFS2 default compression mode" if JFFS2_COMPRESSION_OPTIONS + default JFFS2_CMODE_PRIORITY + depends on JFFS2_FS + help + You can set here the default compression mode of JFFS2 from + the available compression modes. Don't touch if unsure. + +config JFFS2_CMODE_NONE + bool "no compression" + help + Uses no compression. + +config JFFS2_CMODE_PRIORITY + bool "priority" + help + Tries the compressors in a predefined order and chooses the first + successful one. + +config JFFS2_CMODE_SIZE + bool "size (EXPERIMENTAL)" + help + Tries all compressors and chooses the one which has the smallest + result. + +config JFFS2_CMODE_FAVOURLZO + bool "Favour LZO" + help + Tries all compressors and chooses the one which has the smallest + result but gives some preference to LZO (which has faster + decompression) at the expense of size. + +endchoice diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c index 86739ee53b37..f25e70c1b51c 100644 --- a/fs/jffs2/compr.c +++ b/fs/jffs2/compr.c @@ -53,8 +53,8 @@ static int jffs2_is_best_compression(struct jffs2_compressor *this, } /* jffs2_compress: - * @data: Pointer to uncompressed data - * @cdata: Pointer to returned pointer to buffer for compressed data + * @data_in: Pointer to uncompressed data + * @cpage_out: Pointer to returned pointer to buffer for compressed data * @datalen: On entry, holds the amount of data available for compression. * On exit, expected to hold the amount of data actually compressed. 
* @cdatalen: On entry, holds the amount of space available for compressed diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index cd219ef55254..b1aaae823a52 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -311,7 +311,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char /* FIXME: If you care. We'd need to use frags for the target if it grows much more than this */ if (targetlen > 254) - return -EINVAL; + return -ENAMETOOLONG; ri = jffs2_alloc_raw_inode(); diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index dddb2a6c9e2c..259461b910af 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -68,7 +68,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, instr->len = c->sector_size; instr->callback = jffs2_erase_callback; instr->priv = (unsigned long)(&instr[1]); - instr->fail_addr = 0xffffffff; + instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN; ((struct erase_priv_struct *)instr->priv)->jeb = jeb; ((struct erase_priv_struct *)instr->priv)->c = c; @@ -175,7 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock { /* For NAND, if the failure did not occur at the device level for a specific physical page, don't bother updating the bad block table. */ - if (jffs2_cleanmarker_oob(c) && (bad_offset != 0xffffffff)) { + if (jffs2_cleanmarker_oob(c) && (bad_offset != MTD_FAIL_ADDR_UNKNOWN)) { /* We had a device-level failure to erase. Let's see if we've failed too many times. 
*/ if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 086c43830221..249305d65d5b 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -207,6 +207,8 @@ int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = 0; buf->f_ffree = 0; buf->f_namelen = JFFS2_MAX_NAME_LEN; + buf->f_fsid.val[0] = JFFS2_SUPER_MAGIC; + buf->f_fsid.val[1] = c->mtd->index; spin_lock(&c->erase_completion_lock); avail = c->dirty_size + c->free_size; @@ -440,14 +442,14 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i memset(ri, 0, sizeof(*ri)); /* Set OS-specific defaults for new inodes */ - ri->uid = cpu_to_je16(current->fsuid); + ri->uid = cpu_to_je16(current_fsuid()); if (dir_i->i_mode & S_ISGID) { ri->gid = cpu_to_je16(dir_i->i_gid); if (S_ISDIR(mode)) mode |= S_ISGID; } else { - ri->gid = cpu_to_je16(current->fsgid); + ri->gid = cpu_to_je16(current_fsgid()); } /* POSIX ACLs have to be processed now, at least partly. 
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c index a9bf9603c1ba..0875b60b4bf7 100644 --- a/fs/jffs2/nodemgmt.c +++ b/fs/jffs2/nodemgmt.c @@ -261,6 +261,10 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c) jffs2_sum_reset_collected(c->summary); /* reset collected summary */ + /* adjust write buffer offset, else we get a non contiguous write bug */ + if (!(c->wbuf_ofs % c->sector_size) && !c->wbuf_len) + c->wbuf_ofs = 0xffffffff; + D1(printk(KERN_DEBUG "jffs2_find_nextblock(): new nextblock = 0x%08x\n", c->nextblock->offset)); return 0; diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 0e78b00035e4..d9a721e6db70 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -679,10 +679,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) memset(c->wbuf,0xff,c->wbuf_pagesize); /* adjust write buffer offset, else we get a non contiguous write bug */ - if (SECTOR_ADDR(c->wbuf_ofs) == SECTOR_ADDR(c->wbuf_ofs+c->wbuf_pagesize)) - c->wbuf_ofs += c->wbuf_pagesize; - else - c->wbuf_ofs = 0xffffffff; + c->wbuf_ofs += c->wbuf_pagesize; c->wbuf_len = 0; return 0; } diff --git a/fs/mpage.c b/fs/mpage.c index dbcc7af76a15..552b80b3facc 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -6,7 +6,7 @@ * Contains functions related to preparing and submitting BIOs which contain * multiple pagecache pages. * - * 15May2002 akpm@zip.com.au + * 15May2002 Andrew Morton * Initial version * 27Jun2002 axboe@suse.de * use bio_add_page() to build bio's just the right size diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 6a09760c5960..c2e9cfd9e5a4 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -40,6 +40,16 @@ unsigned short nfs_callback_tcpport; static const int nfs_set_port_min = 0; static const int nfs_set_port_max = 65535; +/* + * If the kernel has IPv6 support available, always listen for + * both AF_INET and AF_INET6 requests. 
+ */ +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static const sa_family_t nfs_callback_family = AF_INET6; +#else +static const sa_family_t nfs_callback_family = AF_INET; +#endif + static int param_set_port(const char *val, struct kernel_param *kp) { char *endp; @@ -106,7 +116,7 @@ int nfs_callback_up(void) if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) goto out; serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, - AF_INET, NULL); + nfs_callback_family, NULL); ret = -ENOMEM; if (!serv) goto out_err; @@ -116,7 +126,8 @@ int nfs_callback_up(void) if (ret <= 0) goto out_err; nfs_callback_tcpport = ret; - dprintk("Callback port = 0x%x\n", nfs_callback_tcpport); + dprintk("NFS: Callback listener port = %u (af %u)\n", + nfs_callback_tcpport, nfs_callback_family); nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); if (IS_ERR(nfs_callback_info.rqst)) { @@ -149,8 +160,8 @@ out: mutex_unlock(&nfs_callback_mutex); return ret; out_err: - dprintk("Couldn't create callback socket or server thread; err = %d\n", - ret); + dprintk("NFS: Couldn't create callback socket or server thread; " + "err = %d\n", ret); nfs_callback_info.users--; goto out; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 5ee23e7058b3..7547600b6174 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -675,7 +675,7 @@ static int nfs_init_server(struct nfs_server *server, server->nfs_client = clp; /* Initialise the client representation from the mount data */ - server->flags = data->flags & NFS_MOUNT_FLAGMASK; + server->flags = data->flags; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); @@ -850,7 +850,6 @@ static struct nfs_server *nfs_alloc_server(void) INIT_LIST_HEAD(&server->client_link); INIT_LIST_HEAD(&server->master_link); - init_waitqueue_head(&server->active_wq); atomic_set(&server->active, 0); server->io_stats = nfs_alloc_iostats(); @@ -1073,7 +1072,7 @@ static int nfs4_init_server(struct nfs_server *server, 
goto error; /* Initialise the client representation from the mount data */ - server->flags = data->flags & NFS_MOUNT_FLAGMASK; + server->flags = data->flags; server->caps |= NFS_CAP_ATOMIC_OPEN; if (data->rsize) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 74f92b717f78..efdba2e802d7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -156,6 +156,7 @@ typedef struct { decode_dirent_t decode; int plus; unsigned long timestamp; + unsigned long gencount; int timestamp_valid; } nfs_readdir_descriptor_t; @@ -177,7 +178,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) struct file *file = desc->file; struct inode *inode = file->f_path.dentry->d_inode; struct rpc_cred *cred = nfs_file_cred(file); - unsigned long timestamp; + unsigned long timestamp, gencount; int error; dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n", @@ -186,6 +187,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) again: timestamp = jiffies; + gencount = nfs_inc_attr_generation_counter(); error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page, NFS_SERVER(inode)->dtsize, desc->plus); if (error < 0) { @@ -199,6 +201,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) goto error; } desc->timestamp = timestamp; + desc->gencount = gencount; desc->timestamp_valid = 1; SetPageUptodate(page); /* Ensure consistent page alignment of the data. 
@@ -224,9 +227,10 @@ int dir_decode(nfs_readdir_descriptor_t *desc) if (IS_ERR(p)) return PTR_ERR(p); desc->ptr = p; - if (desc->timestamp_valid) + if (desc->timestamp_valid) { desc->entry->fattr->time_start = desc->timestamp; - else + desc->entry->fattr->gencount = desc->gencount; + } else desc->entry->fattr->valid &= ~NFS_ATTR_FATTR; return 0; } @@ -471,7 +475,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, struct rpc_cred *cred = nfs_file_cred(file); struct page *page = NULL; int status; - unsigned long timestamp; + unsigned long timestamp, gencount; dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", (unsigned long long)*desc->dir_cookie); @@ -482,6 +486,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, goto out; } timestamp = jiffies; + gencount = nfs_inc_attr_generation_counter(); status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, *desc->dir_cookie, page, NFS_SERVER(inode)->dtsize, @@ -490,6 +495,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ if (status >= 0) { desc->timestamp = timestamp; + desc->gencount = gencount; desc->timestamp_valid = 1; if ((status = dir_decode(desc)) == 0) desc->entry->prev_cookie = *desc->dir_cookie; @@ -655,7 +661,7 @@ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) */ void nfs_force_lookup_revalidate(struct inode *dir) { - NFS_I(dir)->cache_change_attribute = jiffies; + NFS_I(dir)->cache_change_attribute++; } /* @@ -667,6 +673,8 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry) { if (IS_ROOT(dentry)) return 1; + if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) + return 0; if (!nfs_verify_change_attribute(dir, dentry->d_time)) return 0; /* Revalidate nfsi->cache_change_attribute before we declare a match */ @@ -750,6 +758,8 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, /* Don't 
revalidate a negative dentry if we're creating a new file */ if (nd != NULL && nfs_lookup_check_intent(nd, LOOKUP_CREATE) != 0) return 0; + if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) + return 1; return !nfs_check_verifier(dir, dentry); } @@ -1507,7 +1517,7 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0, GFP_KERNEL)) { pagevec_add(&lru_pvec, page); - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); SetPageUptodate(page); unlock_page(page); } else diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 78460657f5cb..d319b49f8f06 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -188,13 +188,16 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) /* origin == SEEK_END => we must revalidate the cached file length */ if (origin == SEEK_END) { struct inode *inode = filp->f_mapping->host; + int retval = nfs_revalidate_file_size(inode, filp); if (retval < 0) return (loff_t)retval; - } - lock_kernel(); /* BKL needed? */ - loff = generic_file_llseek_unlocked(filp, offset, origin); - unlock_kernel(); + + spin_lock(&inode->i_lock); + loff = generic_file_llseek_unlocked(filp, offset, origin); + spin_unlock(&inode->i_lock); + } else + loff = generic_file_llseek_unlocked(filp, offset, origin); return loff; } @@ -699,13 +702,6 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) filp->f_path.dentry->d_name.name, fl->fl_type, fl->fl_flags); - /* - * No BSD flocks over NFS allowed. - * Note: we could try to fake a POSIX lock request here by - * using ((u32) filp | 0x80000000) or some such as the pid. - * Not sure whether that would be unique, though, or whether - * that would break in other places. 
- */ if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 52daefa2f521..b9195c02a863 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -305,8 +305,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) init_special_inode(inode, inode->i_mode, fattr->rdev); nfsi->read_cache_jiffies = fattr->time_start; - nfsi->last_updated = now; - nfsi->cache_change_attribute = now; + nfsi->attr_gencount = fattr->gencount; inode->i_atime = fattr->atime; inode->i_mtime = fattr->mtime; inode->i_ctime = fattr->ctime; @@ -453,6 +452,7 @@ out_big: void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) { if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { + spin_lock(&inode->i_lock); if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; @@ -462,7 +462,6 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) inode->i_uid = attr->ia_uid; if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; - spin_lock(&inode->i_lock); NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; spin_unlock(&inode->i_lock); } @@ -472,37 +471,6 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) } } -static int nfs_wait_schedule(void *word) -{ - if (signal_pending(current)) - return -ERESTARTSYS; - schedule(); - return 0; -} - -/* - * Wait for the inode to get unlocked. 
- */ -static int nfs_wait_on_inode(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - int error; - - error = wait_on_bit_lock(&nfsi->flags, NFS_INO_REVALIDATING, - nfs_wait_schedule, TASK_KILLABLE); - - return error; -} - -static void nfs_wake_up_inode(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - - clear_bit(NFS_INO_REVALIDATING, &nfsi->flags); - smp_mb__after_clear_bit(); - wake_up_bit(&nfsi->flags, NFS_INO_REVALIDATING); -} - int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; @@ -697,20 +665,15 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode)); - nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); if (is_bad_inode(inode)) - goto out_nowait; + goto out; if (NFS_STALE(inode)) - goto out_nowait; - - status = nfs_wait_on_inode(inode); - if (status < 0) goto out; - status = -ESTALE; if (NFS_STALE(inode)) goto out; + nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); if (status != 0) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", @@ -724,16 +687,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) goto out; } - spin_lock(&inode->i_lock); - status = nfs_update_inode(inode, &fattr); + status = nfs_refresh_inode(inode, &fattr); if (status) { - spin_unlock(&inode->i_lock); dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode), status); goto out; } - spin_unlock(&inode->i_lock); if (nfsi->cache_validity & NFS_INO_INVALID_ACL) nfs_zap_acl_cache(inode); @@ -743,9 +703,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) (long long)NFS_FILEID(inode)); out: - nfs_wake_up_inode(inode); - - out_nowait: return status; } @@ -908,9 +865,6 @@ 
static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat return -EIO; } - /* Do atomic weak cache consistency updates */ - nfs_wcc_update_inode(inode, fattr); - if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && nfsi->change_attr != fattr->change_attr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; @@ -939,15 +893,81 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if (invalid != 0) nfsi->cache_validity |= invalid; - else - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ATIME - | NFS_INO_REVAL_PAGECACHE); nfsi->read_cache_jiffies = fattr->time_start; return 0; } +static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; +} + +static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); +} + +static unsigned long nfs_attr_generation_counter; + +static unsigned long nfs_read_attr_generation_counter(void) +{ + smp_rmb(); + return nfs_attr_generation_counter; +} + +unsigned long nfs_inc_attr_generation_counter(void) +{ + unsigned long ret; + smp_rmb(); + ret = ++nfs_attr_generation_counter; + smp_wmb(); + return ret; +} + +void nfs_fattr_init(struct nfs_fattr *fattr) +{ + fattr->valid = 0; + fattr->time_start = jiffies; + fattr->gencount = nfs_inc_attr_generation_counter(); +} + +/** + * nfs_inode_attrs_need_update - check if the inode attributes need updating + * @inode - pointer to inode + * @fattr - attributes + * + * Attempt to divine whether or not an RPC call reply carrying stale + * attributes got scheduled after another call carrying updated ones. 
+ * + * To do so, the function first assumes that a more recent ctime means + * that the attributes in fattr are newer, however it also attempt to + * catch the case where ctime either didn't change, or went backwards + * (if someone reset the clock on the server) by looking at whether + * or not this RPC call was started after the inode was last updated. + * Note also the check for wraparound of 'attr_gencount' + * + * The function returns 'true' if it thinks the attributes in 'fattr' are + * more recent than the ones cached in the inode. + * + */ +static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + const struct nfs_inode *nfsi = NFS_I(inode); + + return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 || + nfs_ctime_need_update(inode, fattr) || + nfs_size_need_update(inode, fattr) || + ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); +} + +static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +{ + if (nfs_inode_attrs_need_update(inode, fattr)) + return nfs_update_inode(inode, fattr); + return nfs_check_inode_attributes(inode, fattr); +} + /** * nfs_refresh_inode - try to update the inode attribute cache * @inode - pointer to inode @@ -960,21 +980,28 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat */ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_inode *nfsi = NFS_I(inode); int status; if ((fattr->valid & NFS_ATTR_FATTR) == 0) return 0; spin_lock(&inode->i_lock); - if (time_after(fattr->time_start, nfsi->last_updated)) - status = nfs_update_inode(inode, fattr); - else - status = nfs_check_inode_attributes(inode, fattr); - + status = nfs_refresh_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); return status; } +static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + nfsi->cache_validity |= 
NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if (S_ISDIR(inode->i_mode)) + nfsi->cache_validity |= NFS_INO_INVALID_DATA; + if ((fattr->valid & NFS_ATTR_FATTR) == 0) + return 0; + return nfs_refresh_inode_locked(inode, fattr); +} + /** * nfs_post_op_update_inode - try to update the inode attribute cache * @inode - pointer to inode @@ -991,14 +1018,12 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) */ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_inode *nfsi = NFS_I(inode); + int status; spin_lock(&inode->i_lock); - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - if (S_ISDIR(inode->i_mode)) - nfsi->cache_validity |= NFS_INO_INVALID_DATA; + status = nfs_post_op_update_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); - return nfs_refresh_inode(inode, fattr); + return status; } /** @@ -1014,6 +1039,15 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) { + int status; + + spin_lock(&inode->i_lock); + /* Don't do a WCC update if these attributes are already stale */ + if ((fattr->valid & NFS_ATTR_FATTR) == 0 || + !nfs_inode_attrs_need_update(inode, fattr)) { + fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC); + goto out_noforce; + } if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && (fattr->valid & NFS_ATTR_WCC_V4) == 0) { fattr->pre_change_attr = NFS_I(inode)->change_attr; @@ -1026,7 +1060,10 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa fattr->pre_size = i_size_read(inode); fattr->valid |= NFS_ATTR_WCC; } - return nfs_post_op_update_inode(inode, fattr); +out_noforce: + status = nfs_post_op_update_inode_locked(inode, fattr); + spin_unlock(&inode->i_lock); + return status; } /* @@ -1092,7 +1129,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } /* If ctime has changed we should definitely clear 
access+acl caches */ if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) - invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; } else if (nfsi->change_attr != fattr->change_attr) { dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); @@ -1126,6 +1163,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_gid != fattr->gid) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + if (inode->i_nlink != fattr->nlink) + invalid |= NFS_INO_INVALID_ATTR; + inode->i_mode = fattr->mode; inode->i_nlink = fattr->nlink; inode->i_uid = fattr->uid; @@ -1145,18 +1185,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; - nfsi->last_updated = now; + nfsi->attr_gencount = nfs_inc_attr_generation_counter(); } else { if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; } - /* - * Avoid jiffy wraparound issues with nfsi->last_updated - */ - if (!time_in_range(nfsi->last_updated, nfsi->read_cache_jiffies, now)) - nfsi->last_updated = nfsi->read_cache_jiffies; } invalid &= ~NFS_INO_INVALID_ATTR; /* Don't invalidate the data if we were to blame */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 24241fcbb98d..d212ee41caf2 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -153,6 +153,7 @@ extern void nfs4_clear_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); /* super.c */ +void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *); extern struct file_system_type nfs_xdev_fs_type; #ifdef CONFIG_NFS_V4 extern struct file_system_type 
nfs4_xdev_fs_type; @@ -163,8 +164,8 @@ extern struct rpc_stat nfs_rpcstat; extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); -extern void nfs_sb_active(struct nfs_server *server); -extern void nfs_sb_deactive(struct nfs_server *server); +extern void nfs_sb_active(struct super_block *sb); +extern void nfs_sb_deactive(struct super_block *sb); /* namespace.c */ extern char *nfs_path(const char *base, @@ -276,3 +277,23 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) PAGE_SIZE - 1) >> PAGE_SHIFT; } +#define IPV6_SCOPE_DELIMITER '%' + +/* + * Set the port number in an address. Be agnostic about the address + * family. + */ +static inline void nfs_set_port(struct sockaddr *sap, unsigned short port) +{ + struct sockaddr_in *ap = (struct sockaddr_in *)sap; + struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap; + + switch (sap->sa_family) { + case AF_INET: + ap->sin_port = htons(port); + break; + case AF_INET6: + ap6->sin6_port = htons(port); + break; + } +} diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 779d2eb649c5..086a6830d785 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -14,6 +14,7 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/sched.h> #include <linux/nfs_fs.h> +#include "internal.h" #ifdef RPC_DEBUG # define NFSDBG_FACILITY NFSDBG_MOUNT @@ -98,7 +99,7 @@ out_call_err: out_mnt_err: dprintk("NFS: MNT server returned result %d\n", result.status); - status = -EACCES; + status = nfs_stat_to_errno(result.status); goto out; } diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 66df08dd1caf..64a288ee046d 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -105,7 +105,10 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) dprintk("--> nfs_follow_mountpoint()\n"); - BUG_ON(IS_ROOT(dentry)); + err = -ESTALE; + if (IS_ROOT(dentry)) + goto out_err; + dprintk("%s: enter\n", __func__); dput(nd->path.dentry); nd->path.dentry = dget(dentry); @@ 
-189,7 +192,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, struct nfs_clone_mount *mountdata) { #ifdef CONFIG_NFS_V4 - struct vfsmount *mnt = NULL; + struct vfsmount *mnt = ERR_PTR(-EINVAL); switch (server->nfs_client->rpc_ops->version) { case 2: case 3: diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 423842f51ac9..cef62557c87d 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -229,6 +229,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) dprintk("NFS call getacl\n"); msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; + nfs_fattr_init(&fattr); status = rpc_call_sync(server->client_acl, &msg, 0); dprintk("NFS reply getacl: %d\n", status); @@ -322,6 +323,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, dprintk("NFS call setacl\n"); msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; + nfs_fattr_init(&fattr); status = rpc_call_sync(server->client_acl, &msg, 0); nfs_access_zap_cache(inode); nfs_zap_acl_cache(inode); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 1e750e4574a9..c55be7a7679e 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -699,7 +699,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, } static int -nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, +do_proc_fsinfo(struct rpc_clnt *client, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { struct rpc_message msg = { @@ -711,11 +711,27 @@ nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, dprintk("NFS call fsinfo\n"); nfs_fattr_init(info->fattr); - status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + status = rpc_call_sync(client, &msg, 0); dprintk("NFS reply fsinfo: %d\n", status); return status; } +/* + * Bare-bones access to fsinfo: this is for nfs_get_root/nfs_get_sb via + * nfs_create_server + */ +static int +nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo 
*info) +{ + int status; + + status = do_proc_fsinfo(server->client, fhandle, info); + if (status && server->nfs_client->cl_rpcclient != server->client) + status = do_proc_fsinfo(server->nfs_client->cl_rpcclient, fhandle, info); + return status; +} + static int nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_pathconf *info) diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index b112857301f7..30befc39b3c6 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -93,21 +93,52 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent, return 0; } -/* - * Check if the string represents a "valid" IPv4 address - */ -static inline int valid_ipaddr4(const char *buf) +static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, + char *page, char *page2, + const struct nfs4_fs_location *location) { - int rc, count, in[4]; - - rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]); - if (rc != 4) - return -EINVAL; - for (count = 0; count < 4; count++) { - if (in[count] > 255) - return -EINVAL; + struct vfsmount *mnt = ERR_PTR(-ENOENT); + char *mnt_path; + int page2len; + unsigned int s; + + mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); + if (IS_ERR(mnt_path)) + return mnt; + mountdata->mnt_path = mnt_path; + page2 += strlen(mnt_path) + 1; + page2len = PAGE_SIZE - strlen(mnt_path) - 1; + + for (s = 0; s < location->nservers; s++) { + const struct nfs4_string *buf = &location->servers[s]; + struct sockaddr_storage addr; + + if (buf->len <= 0 || buf->len >= PAGE_SIZE) + continue; + + mountdata->addr = (struct sockaddr *)&addr; + + if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) + continue; + nfs_parse_ip_address(buf->data, buf->len, + mountdata->addr, &mountdata->addrlen); + if (mountdata->addr->sa_family == AF_UNSPEC) + continue; + nfs_set_port(mountdata->addr, NFS_PORT); + + strncpy(page2, buf->data, page2len); + page2[page2len] = '\0'; + mountdata->hostname 
= page2; + + snprintf(page, PAGE_SIZE, "%s:%s", + mountdata->hostname, + mountdata->mnt_path); + + mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata); + if (!IS_ERR(mnt)) + break; } - return 0; + return mnt; } /** @@ -128,7 +159,6 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor, }; char *page = NULL, *page2 = NULL; - unsigned int s; int loc, error; if (locations == NULL || locations->nlocations <= 0) @@ -152,53 +182,16 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, goto out; } - loc = 0; - while (loc < locations->nlocations && IS_ERR(mnt)) { + for (loc = 0; loc < locations->nlocations; loc++) { const struct nfs4_fs_location *location = &locations->locations[loc]; - char *mnt_path; if (location == NULL || location->nservers <= 0 || - location->rootpath.ncomponents == 0) { - loc++; + location->rootpath.ncomponents == 0) continue; - } - mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); - if (IS_ERR(mnt_path)) { - loc++; - continue; - } - mountdata.mnt_path = mnt_path; - - s = 0; - while (s < location->nservers) { - struct sockaddr_in addr = { - .sin_family = AF_INET, - .sin_port = htons(NFS_PORT), - }; - - if (location->servers[s].len <= 0 || - valid_ipaddr4(location->servers[s].data) < 0) { - s++; - continue; - } - - mountdata.hostname = location->servers[s].data; - addr.sin_addr.s_addr = in_aton(mountdata.hostname), - mountdata.addr = (struct sockaddr *)&addr; - mountdata.addrlen = sizeof(addr); - - snprintf(page, PAGE_SIZE, "%s:%s", - mountdata.hostname, - mountdata.mnt_path); - - mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, &mountdata); - if (!IS_ERR(mnt)) { - break; - } - s++; - } - loc++; + mnt = try_location(&mountdata, page, page2, location); + if (!IS_ERR(mnt)) + break; } out: diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c910413eaeca..83e700a2b0c0 100644 --- 
a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1659,8 +1659,10 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, struct nfs_open_context *ctx; ctx = nfs_file_open_context(sattr->ia_file); - cred = ctx->cred; - state = ctx->state; + if (ctx) { + cred = ctx->cred; + state = ctx->state; + } } status = nfs4_do_setattr(inode, cred, fattr, sattr, state); diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 4dbb84df1b68..193465210d7c 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -65,14 +65,20 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, dprintk("%s: call getattr\n", __func__); nfs_fattr_init(fattr); - status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + status = rpc_call_sync(server->client, &msg, 0); + /* Retry with default authentication if different */ + if (status && server->nfs_client->cl_rpcclient != server->client) + status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); dprintk("%s: reply getattr: %d\n", __func__, status); if (status) return status; dprintk("%s: call statfs\n", __func__); msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS]; msg.rpc_resp = &fsinfo; - status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + status = rpc_call_sync(server->client, &msg, 0); + /* Retry with default authentication if different */ + if (status && server->nfs_client->cl_rpcclient != server->client) + status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); dprintk("%s: reply statfs: %d\n", __func__, status); if (status) return status; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ffb697416cb1..a3b0061dfd45 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -91,6 +91,7 @@ enum { /* Mount options that take string arguments */ Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, Opt_addr, Opt_mountaddr, Opt_clientaddr, + Opt_lookupcache, /* Special mount options */ Opt_userspace, Opt_deprecated, Opt_sloppy, @@ -154,6 +155,8 @@ static const match_table_t nfs_mount_option_tokens = { 
{ Opt_mounthost, "mounthost=%s" }, { Opt_mountaddr, "mountaddr=%s" }, + { Opt_lookupcache, "lookupcache=%s" }, + { Opt_err, NULL } }; @@ -200,6 +203,22 @@ static const match_table_t nfs_secflavor_tokens = { { Opt_sec_err, NULL } }; +enum { + Opt_lookupcache_all, Opt_lookupcache_positive, + Opt_lookupcache_none, + + Opt_lookupcache_err +}; + +static match_table_t nfs_lookupcache_tokens = { + { Opt_lookupcache_all, "all" }, + { Opt_lookupcache_positive, "pos" }, + { Opt_lookupcache_positive, "positive" }, + { Opt_lookupcache_none, "none" }, + + { Opt_lookupcache_err, NULL } +}; + static void nfs_umount_begin(struct super_block *); static int nfs_statfs(struct dentry *, struct kstatfs *); @@ -209,7 +228,6 @@ static int nfs_get_sb(struct file_system_type *, int, const char *, void *, stru static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); static void nfs_kill_super(struct super_block *); -static void nfs_put_super(struct super_block *); static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); static struct file_system_type nfs_fs_type = { @@ -232,7 +250,6 @@ static const struct super_operations nfs_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, .write_inode = nfs_write_inode, - .put_super = nfs_put_super, .statfs = nfs_statfs, .clear_inode = nfs_clear_inode, .umount_begin = nfs_umount_begin, @@ -337,26 +354,20 @@ void __exit unregister_nfs_fs(void) unregister_filesystem(&nfs_fs_type); } -void nfs_sb_active(struct nfs_server *server) +void nfs_sb_active(struct super_block *sb) { - atomic_inc(&server->active); -} + struct nfs_server *server = NFS_SB(sb); -void nfs_sb_deactive(struct nfs_server *server) -{ - if (atomic_dec_and_test(&server->active)) - wake_up(&server->active_wq); + if (atomic_inc_return(&server->active) == 1) + atomic_inc(&sb->s_active); } -static void nfs_put_super(struct super_block *sb) +void nfs_sb_deactive(struct 
super_block *sb) { struct nfs_server *server = NFS_SB(sb); - /* - * Make sure there are no outstanding ops to this server. - * If so, wait for them to finish before allowing the - * unmount to continue. - */ - wait_event(server->active_wq, atomic_read(&server->active) == 0); + + if (atomic_dec_and_test(&server->active)) + deactivate_super(sb); } /* @@ -664,25 +675,6 @@ static void nfs_umount_begin(struct super_block *sb) } /* - * Set the port number in an address. Be agnostic about the address family. - */ -static void nfs_set_port(struct sockaddr *sap, unsigned short port) -{ - switch (sap->sa_family) { - case AF_INET: { - struct sockaddr_in *ap = (struct sockaddr_in *)sap; - ap->sin_port = htons(port); - break; - } - case AF_INET6: { - struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap; - ap->sin6_port = htons(port); - break; - } - } -} - -/* * Sanity-check a server address provided by the mount command. * * Address family must be initialized, and address must not be @@ -724,20 +716,22 @@ static void nfs_parse_ipv4_address(char *string, size_t str_len, *addr_len = 0; } -#define IPV6_SCOPE_DELIMITER '%' - #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, - const char *delim, - struct sockaddr_in6 *sin6) +static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, + const char *delim, + struct sockaddr_in6 *sin6) { char *p; size_t len; - if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) - return ; + if ((string + str_len) == delim) + return 1; + if (*delim != IPV6_SCOPE_DELIMITER) - return; + return 0; + + if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) + return 0; len = (string + str_len) - delim - 1; p = kstrndup(delim + 1, len, GFP_KERNEL); @@ -750,14 +744,20 @@ static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, scope_id = dev->ifindex; dev_put(dev); } else { - /* scope_id is set to zero on error */ - 
strict_strtoul(p, 10, &scope_id); + if (strict_strtoul(p, 10, &scope_id) == 0) { + kfree(p); + return 0; + } } kfree(p); + sin6->sin6_scope_id = scope_id; dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id); + return 1; } + + return 0; } static void nfs_parse_ipv6_address(char *string, size_t str_len, @@ -773,9 +773,11 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len, sin6->sin6_family = AF_INET6; *addr_len = sizeof(*sin6); - if (in6_pton(string, str_len, addr, IPV6_SCOPE_DELIMITER, &delim)) { - nfs_parse_ipv6_scope_id(string, str_len, delim, sin6); - return; + if (in6_pton(string, str_len, addr, + IPV6_SCOPE_DELIMITER, &delim) != 0) { + if (nfs_parse_ipv6_scope_id(string, str_len, + delim, sin6) != 0) + return; } } @@ -798,7 +800,7 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len, * If there is a problem constructing the new sockaddr, set the address * family to AF_UNSPEC. */ -static void nfs_parse_ip_address(char *string, size_t str_len, +void nfs_parse_ip_address(char *string, size_t str_len, struct sockaddr *sap, size_t *addr_len) { unsigned int i, colons; @@ -1258,6 +1260,30 @@ static int nfs_parse_mount_options(char *raw, &mnt->mount_server.addrlen); kfree(string); break; + case Opt_lookupcache: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + token = match_token(string, + nfs_lookupcache_tokens, args); + kfree(string); + switch (token) { + case Opt_lookupcache_all: + mnt->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE); + break; + case Opt_lookupcache_positive: + mnt->flags &= ~NFS_MOUNT_LOOKUP_CACHE_NONE; + mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG; + break; + case Opt_lookupcache_none: + mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE; + break; + default: + errors++; + dfprintk(MOUNT, "NFS: invalid " + "lookupcache argument\n"); + }; + break; /* * Special options @@ -1558,7 +1584,7 @@ static int nfs_validate_mount_data(void *options, * Translate to 
nfs_parsed_mount_data, which nfs_fill_super * can deal with. */ - args->flags = data->flags; + args->flags = data->flags & NFS_MOUNT_FLAGMASK; args->rsize = data->rsize; args->wsize = data->wsize; args->timeo = data->timeo; @@ -2433,7 +2459,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags, compare_super = NULL; /* Get a superblock - note that we may end up sharing one that already exists */ - s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata); + s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); if (IS_ERR(s)) { error = PTR_ERR(s); goto out_err_nosb; @@ -2518,7 +2544,7 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags, compare_super = NULL; /* Get a superblock - note that we may end up sharing one that already exists */ - s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata); + s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); if (IS_ERR(s)) { error = PTR_ERR(s); goto out_err_nosb; diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index f089e5839d7d..ecc295347775 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -99,7 +99,7 @@ static void nfs_async_unlink_release(void *calldata) nfs_dec_sillycount(data->dir); nfs_free_unlinkdata(data); - nfs_sb_deactive(NFS_SB(sb)); + nfs_sb_deactive(sb); } static const struct rpc_call_ops nfs_unlink_ops = { @@ -118,6 +118,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n .rpc_message = &msg, .callback_ops = &nfs_unlink_ops, .callback_data = data, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; struct rpc_task *task; @@ -149,7 +150,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n nfs_dec_sillycount(dir); return 0; } - nfs_sb_active(NFS_SERVER(dir)); + nfs_sb_active(dir->i_sb); data->args.fh = NFS_FH(dir); nfs_fattr_init(&data->res.dir_attr); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3229e217c773..9f9845859fc1 100644 --- 
a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1427,8 +1427,9 @@ static int nfs_write_mapping(struct address_space *mapping, int how) .bdi = mapping->backing_dev_info, .sync_mode = WB_SYNC_NONE, .nr_to_write = LONG_MAX, + .range_start = 0, + .range_end = LLONG_MAX, .for_writepages = 1, - .range_cyclic = 1, }; int ret; diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 64965e1c21c4..9b0efdad8910 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -13,9 +13,7 @@ #include <linux/nls.h> #include <linux/kernel.h> #include <linux/errno.h> -#ifdef CONFIG_KMOD #include <linux/kmod.h> -#endif #include <linux/spinlock.h> static struct nls_table default_table; @@ -215,24 +213,7 @@ static struct nls_table *find_nls(char *charset) struct nls_table *load_nls(char *charset) { - struct nls_table *nls; -#ifdef CONFIG_KMOD - int ret; -#endif - - nls = find_nls(charset); - if (nls) - return nls; - -#ifdef CONFIG_KMOD - ret = request_module("nls_%s", charset); - if (ret != 0) { - printk("Unable to load NLS charset %s\n", charset); - return NULL; - } - nls = find_nls(charset); -#endif - return nls; + return try_then_request_module(find_nls(charset), "nls_%s", charset); } void unload_nls(struct nls_table *nls) diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index d020866d4232..3140a4429af1 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -439,7 +439,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, pages[nr] = *cached_page; page_cache_get(*cached_page); if (unlikely(!pagevec_add(lru_pvec, *cached_page))) - __pagevec_lru_add(lru_pvec); + __pagevec_lru_add_file(lru_pvec); *cached_page = NULL; } index++; @@ -2084,7 +2084,7 @@ err_out: OSYNC_METADATA|OSYNC_DATA); } } - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", written ? 
"written" : "status", (unsigned long)written, (long)status); diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c index 3d3e16631472..a97b477ac0fc 100644 --- a/fs/partitions/acorn.c +++ b/fs/partitions/acorn.c @@ -275,16 +275,6 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev) id = data[0x1fc] & 15; put_dev_sector(sect); -#ifdef CONFIG_BLK_DEV_MFM - if (MAJOR(bdev->bd_dev) == MFM_ACORN_MAJOR) { - extern void xd_set_geometry(struct block_device *, - unsigned char, unsigned char, unsigned int); - xd_set_geometry(bdev, dr->secspertrack, heads, 1); - invalidate_bh_lrus(); - truncate_inode_pages(bdev->bd_inode->i_mapping, 0); - } -#endif - /* * Work out start of non-adfs partition. */ diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 7408227c49c9..cfb0c80690aa 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -195,6 +195,14 @@ check_partition(struct gendisk *hd, struct block_device *bdev) return ERR_PTR(res); } +static ssize_t part_partition_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%d\n", p->partno); +} + static ssize_t part_start_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -260,6 +268,7 @@ ssize_t part_fail_store(struct device *dev, } #endif +static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); @@ -269,6 +278,7 @@ static struct device_attribute dev_attr_fail = #endif static struct attribute *part_attrs[] = { + &dev_attr_partition.attr, &dev_attr_start.attr, &dev_attr_size.attr, &dev_attr_stat.attr, @@ -538,10 +548,23 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) sector_t from = state->parts[p].from; if (!size) continue; + if (from >= get_capacity(disk)) { + 
printk(KERN_WARNING + "%s: p%d ignored, start %llu is behind the end of the disk\n", + disk->disk_name, p, (unsigned long long) from); + continue; + } if (from + size > get_capacity(disk)) { + /* + * we can not ignore partitions of broken tables + * created by for example camera firmware, but we + * limit them to the end of the disk to avoid + * creating invalid block devices + */ printk(KERN_WARNING - "%s: p%d exceeds device capacity\n", - disk->disk_name, p); + "%s: p%d size %llu limited to end of disk\n", + disk->disk_name, p, (unsigned long long) size); + size = get_capacity(disk) - from; } res = add_partition(disk, p, from, size, state->parts[p].flags); if (res) { diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index b675a49c1823..61b25f4eabe6 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -45,7 +45,6 @@ #include <linux/blkdev.h> #include <linux/hugetlb.h> #include <linux/jiffies.h> -#include <linux/sysrq.h> #include <linux/vmalloc.h> #include <linux/crash_dump.h> #include <linux/pid_namespace.h> @@ -137,6 +136,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off, unsigned long allowed; struct vmalloc_info vmi; long cached; + unsigned long pages[NR_LRU_LISTS]; + int lru; /* * display in kilobytes. @@ -155,51 +156,70 @@ static int meminfo_read_proc(char *page, char **start, off_t off, get_vmalloc_info(&vmi); + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + pages[lru] = global_page_state(NR_LRU_BASE + lru); + /* * Tagged format, for easy grepping and expansion. 
*/ len = sprintf(page, - "MemTotal: %8lu kB\n" - "MemFree: %8lu kB\n" - "Buffers: %8lu kB\n" - "Cached: %8lu kB\n" - "SwapCached: %8lu kB\n" - "Active: %8lu kB\n" - "Inactive: %8lu kB\n" + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "Buffers: %8lu kB\n" + "Cached: %8lu kB\n" + "SwapCached: %8lu kB\n" + "Active: %8lu kB\n" + "Inactive: %8lu kB\n" + "Active(anon): %8lu kB\n" + "Inactive(anon): %8lu kB\n" + "Active(file): %8lu kB\n" + "Inactive(file): %8lu kB\n" +#ifdef CONFIG_UNEVICTABLE_LRU + "Unevictable: %8lu kB\n" + "Mlocked: %8lu kB\n" +#endif #ifdef CONFIG_HIGHMEM - "HighTotal: %8lu kB\n" - "HighFree: %8lu kB\n" - "LowTotal: %8lu kB\n" - "LowFree: %8lu kB\n" + "HighTotal: %8lu kB\n" + "HighFree: %8lu kB\n" + "LowTotal: %8lu kB\n" + "LowFree: %8lu kB\n" #endif - "SwapTotal: %8lu kB\n" - "SwapFree: %8lu kB\n" - "Dirty: %8lu kB\n" - "Writeback: %8lu kB\n" - "AnonPages: %8lu kB\n" - "Mapped: %8lu kB\n" - "Slab: %8lu kB\n" - "SReclaimable: %8lu kB\n" - "SUnreclaim: %8lu kB\n" - "PageTables: %8lu kB\n" + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n" + "Dirty: %8lu kB\n" + "Writeback: %8lu kB\n" + "AnonPages: %8lu kB\n" + "Mapped: %8lu kB\n" + "Slab: %8lu kB\n" + "SReclaimable: %8lu kB\n" + "SUnreclaim: %8lu kB\n" + "PageTables: %8lu kB\n" #ifdef CONFIG_QUICKLIST - "Quicklists: %8lu kB\n" + "Quicklists: %8lu kB\n" #endif - "NFS_Unstable: %8lu kB\n" - "Bounce: %8lu kB\n" - "WritebackTmp: %8lu kB\n" - "CommitLimit: %8lu kB\n" - "Committed_AS: %8lu kB\n" - "VmallocTotal: %8lu kB\n" - "VmallocUsed: %8lu kB\n" - "VmallocChunk: %8lu kB\n", + "NFS_Unstable: %8lu kB\n" + "Bounce: %8lu kB\n" + "WritebackTmp: %8lu kB\n" + "CommitLimit: %8lu kB\n" + "Committed_AS: %8lu kB\n" + "VmallocTotal: %8lu kB\n" + "VmallocUsed: %8lu kB\n" + "VmallocChunk: %8lu kB\n", K(i.totalram), K(i.freeram), K(i.bufferram), K(cached), K(total_swapcache_pages), - K(global_page_state(NR_ACTIVE)), - K(global_page_state(NR_INACTIVE)), + K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), + 
K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), + K(pages[LRU_ACTIVE_ANON]), + K(pages[LRU_INACTIVE_ANON]), + K(pages[LRU_ACTIVE_FILE]), + K(pages[LRU_INACTIVE_FILE]), +#ifdef CONFIG_UNEVICTABLE_LRU + K(pages[LRU_UNEVICTABLE]), + K(global_page_state(NR_MLOCK)), +#endif #ifdef CONFIG_HIGHMEM K(i.totalhigh), K(i.freehigh), @@ -704,28 +724,6 @@ static int execdomains_read_proc(char *page, char **start, off_t off, return proc_calc_metrics(page, start, off, count, eof, len); } -#ifdef CONFIG_MAGIC_SYSRQ -/* - * writing 'C' to /proc/sysrq-trigger is like sysrq-C - */ -static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - if (count) { - char c; - - if (get_user(c, buf)) - return -EFAULT; - __handle_sysrq(c, NULL, 0); - } - return count; -} - -static const struct file_operations proc_sysrq_trigger_operations = { - .write = write_sysrq_trigger, -}; -#endif - #ifdef CONFIG_PROC_PAGE_MONITOR #define KPMSIZE sizeof(u64) #define KPMMASK (KPMSIZE - 1) @@ -934,7 +932,4 @@ void __init proc_misc_init(void) #ifdef CONFIG_PROC_VMCORE proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); #endif -#ifdef CONFIG_MAGIC_SYSRQ - proc_create("sysrq-trigger", S_IWUSR, NULL, &proc_sysrq_trigger_operations); -#endif } diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 841368b87a29..cd9ca67f841b 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -32,9 +32,6 @@ static size_t elfcorebuf_sz; /* Total size of vmcore file. */ static u64 vmcore_size; -/* Stores the physical address of elf header of crash image. */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - struct proc_dir_entry *proc_vmcore = NULL; /* Reads a page from the oldmem device from given offset. 
*/ @@ -647,7 +644,7 @@ static int __init vmcore_init(void) int rc = 0; /* If elfcorehdr= has been passed in cmdline, then capture the dump.*/ - if (!(elfcorehdr_addr < ELFCORE_ADDR_MAX)) + if (!(is_vmcore_usable())) return rc; rc = parse_crash_elf_headers(); if (rc) { diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 5145cb9125af..76acdbc34611 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -112,12 +112,12 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) goto add_error; if (!pagevec_add(&lru_pvec, page)) - __pagevec_lru_add(&lru_pvec); + __pagevec_lru_add_file(&lru_pvec); unlock_page(page); } - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); return 0; fsize_exceeded: diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index b13123424e49..f031d1c925f0 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -61,6 +61,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) inode->i_mapping->a_ops = &ramfs_aops; inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); + mapping_set_unevictable(inode->i_mapping); inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; switch (mode & S_IFMT) { default: diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index b9dbeeca7049..37173fa07d15 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -8,8 +8,6 @@ /* proc info support a la one created by Sizif@Botik.RU for PGC */ -/* $Id: procfs.c,v 1.1.8.2 2001/07/15 17:08:42 god Exp $ */ - #include <linux/module.h> #include <linux/time.h> #include <linux/seq_file.h> @@ -621,7 +619,6 @@ int reiserfs_global_version_in_proc(char *buffer, char **start, #endif /* - * $Log: procfs.c,v $ * Revision 1.1.8.2 2001/07/15 17:08:42 god * . use get_super() in procfs.c * . 
remove remove_save_link() from reiserfs_do_truncate() diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index bb3cb5b7cdb2..ad92461cbfc3 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -155,7 +155,7 @@ static struct dentry *get_xa_file_dentry(const struct inode *inode, xadir = open_xa_dir(inode, flags); if (IS_ERR(xadir)) { return ERR_CAST(xadir); - } else if (xadir && !xadir->d_inode) { + } else if (!xadir->d_inode) { dput(xadir); return ERR_PTR(-ENODATA); } diff --git a/fs/seq_file.c b/fs/seq_file.c index bd20f7f5a933..eba2eabcd2b8 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -452,17 +452,34 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc) int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits) { - size_t len = bitmap_scnprintf_len(nr_bits); + if (m->count < m->size) { + int len = bitmap_scnprintf(m->buf + m->count, + m->size - m->count, bits, nr_bits); + if (m->count + len < m->size) { + m->count += len; + return 0; + } + } + m->count = m->size; + return -1; +} +EXPORT_SYMBOL(seq_bitmap); - if (m->count + len < m->size) { - bitmap_scnprintf(m->buf + m->count, m->size - m->count, - bits, nr_bits); - m->count += len; - return 0; +int seq_bitmap_list(struct seq_file *m, unsigned long *bits, + unsigned int nr_bits) +{ + if (m->count < m->size) { + int len = bitmap_scnlistprintf(m->buf + m->count, + m->size - m->count, bits, nr_bits); + if (m->count + len < m->size) { + m->count += len; + return 0; + } } m->count = m->size; return -1; } +EXPORT_SYMBOL(seq_bitmap_list); static void *single_start(struct seq_file *p, loff_t *pos) { diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 006fc64227dd..66f6e58a7e4b 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -61,6 +61,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) int size = dentry->d_inode->i_size; loff_t offs = *off; int count = min_t(size_t, bytes, PAGE_SIZE); + char *temp; if (size) { if (offs > size) @@ -69,23 
+70,33 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) count = size - offs; } + temp = kmalloc(count, GFP_KERNEL); + if (!temp) + return -ENOMEM; + mutex_lock(&bb->mutex); count = fill_read(dentry, bb->buffer, offs, count); - if (count < 0) - goto out_unlock; + if (count < 0) { + mutex_unlock(&bb->mutex); + goto out_free; + } - if (copy_to_user(userbuf, bb->buffer, count)) { + memcpy(temp, bb->buffer, count); + + mutex_unlock(&bb->mutex); + + if (copy_to_user(userbuf, temp, count)) { count = -EFAULT; - goto out_unlock; + goto out_free; } pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count); *off = offs + count; - out_unlock: - mutex_unlock(&bb->mutex); + out_free: + kfree(temp); return count; } @@ -118,6 +129,7 @@ static ssize_t write(struct file *file, const char __user *userbuf, int size = dentry->d_inode->i_size; loff_t offs = *off; int count = min_t(size_t, bytes, PAGE_SIZE); + char *temp; if (size) { if (offs > size) @@ -126,19 +138,27 @@ static ssize_t write(struct file *file, const char __user *userbuf, count = size - offs; } - mutex_lock(&bb->mutex); + temp = kmalloc(count, GFP_KERNEL); + if (!temp) + return -ENOMEM; - if (copy_from_user(bb->buffer, userbuf, count)) { + if (copy_from_user(temp, userbuf, count)) { count = -EFAULT; - goto out_unlock; + goto out_free; } + mutex_lock(&bb->mutex); + + memcpy(bb->buffer, temp, count); + count = flush_write(dentry, bb->buffer, offs, count); + mutex_unlock(&bb->mutex); + if (count > 0) *off = offs + count; - out_unlock: - mutex_unlock(&bb->mutex); +out_free: + kfree(temp); return count; } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index aedaeba82ae5..3a05a596e3b4 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -370,17 +370,17 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, memset(acxt, 0, sizeof(*acxt)); acxt->parent_sd = parent_sd; - /* Lookup parent inode. inode initialization and I_NEW - * clearing are protected by sysfs_mutex. 
By grabbing it and - * looking up with _nowait variant, inode state can be - * determined reliably. + /* Lookup parent inode. inode initialization is protected by + * sysfs_mutex, so inode existence can be determined by + * looking up inode while holding sysfs_mutex. */ mutex_lock(&sysfs_mutex); - inode = ilookup5_nowait(sysfs_sb, parent_sd->s_ino, sysfs_ilookup_test, - parent_sd); + inode = ilookup5(sysfs_sb, parent_sd->s_ino, sysfs_ilookup_test, + parent_sd); + if (inode) { + WARN_ON(inode->i_state & I_NEW); - if (inode && !(inode->i_state & I_NEW)) { /* parent inode available */ acxt->parent_inode = inode; @@ -393,8 +393,7 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, mutex_lock(&inode->i_mutex); mutex_lock(&sysfs_mutex); } - } else - iput(inode); + } } /** @@ -636,6 +635,7 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, return sd; } +EXPORT_SYMBOL_GPL(sysfs_get_dirent); static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, const char *name, struct sysfs_dirent **p_sd) @@ -829,16 +829,12 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name) if (!new_dentry) goto out_unlock; - /* rename kobject and sysfs_dirent */ + /* rename sysfs_dirent */ error = -ENOMEM; new_name = dup_name = kstrdup(new_name, GFP_KERNEL); if (!new_name) goto out_unlock; - error = kobject_set_name(kobj, "%s", new_name); - if (error) - goto out_unlock; - dup_name = sd->s_name; sd->s_name = new_name; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index c9e4e5091da1..1f4a3f877262 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -19,10 +19,18 @@ #include <linux/poll.h> #include <linux/list.h> #include <linux/mutex.h> +#include <linux/limits.h> #include <asm/uaccess.h> #include "sysfs.h" +/* used in crash dumps to help with debugging */ +static char last_sysfs_file[PATH_MAX]; +void sysfs_printk_last_file(void) +{ + printk(KERN_EMERG "last sysfs file: %s\n", last_sysfs_file); +} + /* * There's one sysfs_buffer for each 
open file and one * sysfs_open_dirent for each sysfs_dirent with one or more open @@ -328,6 +336,11 @@ static int sysfs_open_file(struct inode *inode, struct file *file) struct sysfs_buffer *buffer; struct sysfs_ops *ops; int error = -EACCES; + char *p; + + p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file)); + if (p) + memmove(last_sysfs_file, p, strlen(p) + 1); /* need attr_sd for attr and ops, its parent for kobj */ if (!sysfs_get_active_two(attr_sd)) @@ -440,7 +453,23 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait) return POLLERR|POLLPRI; } -void sysfs_notify(struct kobject *k, char *dir, char *attr) +void sysfs_notify_dirent(struct sysfs_dirent *sd) +{ + struct sysfs_open_dirent *od; + + spin_lock(&sysfs_open_dirent_lock); + + od = sd->s_attr.open; + if (od) { + atomic_inc(&od->event); + wake_up_interruptible(&od->poll); + } + + spin_unlock(&sysfs_open_dirent_lock); +} +EXPORT_SYMBOL_GPL(sysfs_notify_dirent); + +void sysfs_notify(struct kobject *k, const char *dir, const char *attr) { struct sysfs_dirent *sd = k->sd; @@ -450,19 +479,8 @@ void sysfs_notify(struct kobject *k, char *dir, char *attr) sd = sysfs_find_dirent(sd, dir); if (sd && attr) sd = sysfs_find_dirent(sd, attr); - if (sd) { - struct sysfs_open_dirent *od; - - spin_lock(&sysfs_open_dirent_lock); - - od = sd->s_attr.open; - if (od) { - atomic_inc(&od->event); - wake_up_interruptible(&od->poll); - } - - spin_unlock(&sysfs_open_dirent_lock); - } + if (sd) + sysfs_notify_dirent(sd); mutex_unlock(&sysfs_mutex); } diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 14f0023984d7..ab343e371d64 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -16,6 +16,7 @@ #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/init.h> +#include <linux/module.h> #include "sysfs.h" @@ -115,3 +116,17 @@ out_err: sysfs_dir_cachep = NULL; goto out; } + +#undef sysfs_get +struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd) +{ + return __sysfs_get(sd); +} 
+EXPORT_SYMBOL_GPL(sysfs_get); + +#undef sysfs_put +void sysfs_put(struct sysfs_dirent *sd) +{ + __sysfs_put(sd); +} +EXPORT_SYMBOL_GPL(sysfs_put); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index a5db496f71c7..93c6d6b27c4d 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -124,7 +124,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name, struct sysfs_dirent **p_sd); void sysfs_remove_subdir(struct sysfs_dirent *sd); -static inline struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd) +static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) { if (sd) { WARN_ON(!atomic_read(&sd->s_count)); @@ -132,12 +132,14 @@ static inline struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd) } return sd; } +#define sysfs_get(sd) __sysfs_get(sd) -static inline void sysfs_put(struct sysfs_dirent *sd) +static inline void __sysfs_put(struct sysfs_dirent *sd) { if (sd && atomic_dec_and_test(&sd->s_count)) release_sysfs_dirent(sd); } +#define sysfs_put(sd) __sysfs_put(sd) /* * inode.c diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 73db464cd08b..1a4973e10664 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -414,19 +414,21 @@ static int do_budget_space(struct ubifs_info *c) * @c->lst.empty_lebs + @c->freeable_cnt + @c->idx_gc_cnt - * @c->lst.taken_empty_lebs * - * @empty_lebs are available because they are empty. @freeable_cnt are - * available because they contain only free and dirty space and the - * index allocation always occurs after wbufs are synch'ed. - * @idx_gc_cnt are available because they are index LEBs that have been - * garbage collected (including trivial GC) and are awaiting the commit - * before they can be unmapped - note that the in-the-gaps method will - * grab these if it needs them. @taken_empty_lebs are empty_lebs that - * have already been allocated for some purpose (also includes those - * LEBs on the @idx_gc list). + * @c->lst.empty_lebs are available because they are empty. 
+ * @c->freeable_cnt are available because they contain only free and + * dirty space, @c->idx_gc_cnt are available because they are index + * LEBs that have been garbage collected and are awaiting the commit + * before they can be used. And the in-the-gaps method will grab these + * if it needs them. @c->lst.taken_empty_lebs are empty LEBs that have + * already been allocated for some purpose. * - * Note, @taken_empty_lebs may temporarily be higher by one because of - * the way we serialize LEB allocations and budgeting. See a comment in - * 'ubifs_find_free_space()'. + * Note, @c->idx_gc_cnt is included to both @c->lst.empty_lebs (because + * these LEBs are empty) and to @c->lst.taken_empty_lebs (because they + * are taken until after the commit). + * + * Note, @c->lst.taken_empty_lebs may temporarily be higher by one + * because of the way we serialize LEB allocations and budgeting. See a + * comment in 'ubifs_find_free_space()'. */ lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - c->lst.taken_empty_lebs; diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c index 5bb51dac3c16..a0ada596b17c 100644 --- a/fs/ubifs/compress.c +++ b/fs/ubifs/compress.c @@ -91,8 +91,6 @@ struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; * * Note, if the input buffer was not compressed, it is copied to the output * buffer and %UBIFS_COMPR_NONE is returned in @compr_type. - * - * This functions returns %0 on success or a negative error code on failure. 
*/ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, int *compr_type) diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index d7f7645779f2..7186400750e7 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -222,30 +222,38 @@ void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode) { const struct ubifs_inode *ui = ubifs_inode(inode); - printk(KERN_DEBUG "inode %lu\n", inode->i_ino); - printk(KERN_DEBUG "size %llu\n", + printk(KERN_DEBUG "Dump in-memory inode:"); + printk(KERN_DEBUG "\tinode %lu\n", inode->i_ino); + printk(KERN_DEBUG "\tsize %llu\n", (unsigned long long)i_size_read(inode)); - printk(KERN_DEBUG "nlink %u\n", inode->i_nlink); - printk(KERN_DEBUG "uid %u\n", (unsigned int)inode->i_uid); - printk(KERN_DEBUG "gid %u\n", (unsigned int)inode->i_gid); - printk(KERN_DEBUG "atime %u.%u\n", + printk(KERN_DEBUG "\tnlink %u\n", inode->i_nlink); + printk(KERN_DEBUG "\tuid %u\n", (unsigned int)inode->i_uid); + printk(KERN_DEBUG "\tgid %u\n", (unsigned int)inode->i_gid); + printk(KERN_DEBUG "\tatime %u.%u\n", (unsigned int)inode->i_atime.tv_sec, (unsigned int)inode->i_atime.tv_nsec); - printk(KERN_DEBUG "mtime %u.%u\n", + printk(KERN_DEBUG "\tmtime %u.%u\n", (unsigned int)inode->i_mtime.tv_sec, (unsigned int)inode->i_mtime.tv_nsec); - printk(KERN_DEBUG "ctime %u.%u\n", + printk(KERN_DEBUG "\tctime %u.%u\n", (unsigned int)inode->i_ctime.tv_sec, (unsigned int)inode->i_ctime.tv_nsec); - printk(KERN_DEBUG "creat_sqnum %llu\n", ui->creat_sqnum); - printk(KERN_DEBUG "xattr_size %u\n", ui->xattr_size); - printk(KERN_DEBUG "xattr_cnt %u\n", ui->xattr_cnt); - printk(KERN_DEBUG "xattr_names %u\n", ui->xattr_names); - printk(KERN_DEBUG "dirty %u\n", ui->dirty); - printk(KERN_DEBUG "xattr %u\n", ui->xattr); - printk(KERN_DEBUG "flags %d\n", ui->flags); - printk(KERN_DEBUG "compr_type %d\n", ui->compr_type); - printk(KERN_DEBUG "data_len %d\n", ui->data_len); + printk(KERN_DEBUG "\tcreat_sqnum %llu\n", ui->creat_sqnum); + 
printk(KERN_DEBUG "\txattr_size %u\n", ui->xattr_size); + printk(KERN_DEBUG "\txattr_cnt %u\n", ui->xattr_cnt); + printk(KERN_DEBUG "\txattr_names %u\n", ui->xattr_names); + printk(KERN_DEBUG "\tdirty %u\n", ui->dirty); + printk(KERN_DEBUG "\txattr %u\n", ui->xattr); + printk(KERN_DEBUG "\tbulk_read %u\n", ui->xattr); + printk(KERN_DEBUG "\tsynced_i_size %llu\n", + (unsigned long long)ui->synced_i_size); + printk(KERN_DEBUG "\tui_size %llu\n", + (unsigned long long)ui->ui_size); + printk(KERN_DEBUG "\tflags %d\n", ui->flags); + printk(KERN_DEBUG "\tcompr_type %d\n", ui->compr_type); + printk(KERN_DEBUG "\tlast_page_read %lu\n", ui->last_page_read); + printk(KERN_DEBUG "\tread_in_a_row %lu\n", ui->read_in_a_row); + printk(KERN_DEBUG "\tdata_len %d\n", ui->data_len); } void dbg_dump_node(const struct ubifs_info *c, const void *node) @@ -647,6 +655,43 @@ void dbg_dump_lprops(struct ubifs_info *c) } } +void dbg_dump_lpt_info(struct ubifs_info *c) +{ + int i; + + spin_lock(&dbg_lock); + printk(KERN_DEBUG "\tlpt_sz: %lld\n", c->lpt_sz); + printk(KERN_DEBUG "\tpnode_sz: %d\n", c->pnode_sz); + printk(KERN_DEBUG "\tnnode_sz: %d\n", c->nnode_sz); + printk(KERN_DEBUG "\tltab_sz: %d\n", c->ltab_sz); + printk(KERN_DEBUG "\tlsave_sz: %d\n", c->lsave_sz); + printk(KERN_DEBUG "\tbig_lpt: %d\n", c->big_lpt); + printk(KERN_DEBUG "\tlpt_hght: %d\n", c->lpt_hght); + printk(KERN_DEBUG "\tpnode_cnt: %d\n", c->pnode_cnt); + printk(KERN_DEBUG "\tnnode_cnt: %d\n", c->nnode_cnt); + printk(KERN_DEBUG "\tdirty_pn_cnt: %d\n", c->dirty_pn_cnt); + printk(KERN_DEBUG "\tdirty_nn_cnt: %d\n", c->dirty_nn_cnt); + printk(KERN_DEBUG "\tlsave_cnt: %d\n", c->lsave_cnt); + printk(KERN_DEBUG "\tspace_bits: %d\n", c->space_bits); + printk(KERN_DEBUG "\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits); + printk(KERN_DEBUG "\tlpt_offs_bits: %d\n", c->lpt_offs_bits); + printk(KERN_DEBUG "\tlpt_spc_bits: %d\n", c->lpt_spc_bits); + printk(KERN_DEBUG "\tpcnt_bits: %d\n", c->pcnt_bits); + printk(KERN_DEBUG "\tlnum_bits: 
%d\n", c->lnum_bits); + printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs); + printk(KERN_DEBUG "\tLPT head is at %d:%d\n", + c->nhead_lnum, c->nhead_offs); + printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n", c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n", + c->lsave_lnum, c->lsave_offs); + for (i = 0; i < c->lpt_lebs; i++) + printk(KERN_DEBUG "\tLPT LEB %d free %d dirty %d tgc %d " + "cmt %d\n", i + c->lpt_first, c->ltab[i].free, + c->ltab[i].dirty, c->ltab[i].tgc, c->ltab[i].cmt); + spin_unlock(&dbg_lock); +} + void dbg_dump_leb(const struct ubifs_info *c, int lnum) { struct ubifs_scan_leb *sleb; diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 50315fc57185..33d6b95071e4 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -224,6 +224,7 @@ void dbg_dump_lstats(const struct ubifs_lp_stats *lst); void dbg_dump_budg(struct ubifs_info *c); void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); void dbg_dump_lprops(struct ubifs_info *c); +void dbg_dump_lpt_info(struct ubifs_info *c); void dbg_dump_leb(const struct ubifs_info *c, int lnum); void dbg_dump_znode(const struct ubifs_info *c, const struct ubifs_znode *znode); @@ -249,6 +250,8 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot); int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); int dbg_check_cats(struct ubifs_info *c); int dbg_check_ltab(struct ubifs_info *c); +int dbg_chk_lpt_free_spc(struct ubifs_info *c); +int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len); int dbg_check_synced_i_size(struct inode *inode); int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir); int dbg_check_tnc(struct ubifs_info *c, int extra); @@ -367,6 +370,7 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum, #define dbg_dump_budg(c) ({}) #define dbg_dump_lprop(c, lp) ({}) #define dbg_dump_lprops(c) ({}) +#define dbg_dump_lpt_info(c) 
({}) #define dbg_dump_leb(c, lnum) ({}) #define dbg_dump_znode(c, znode) ({}) #define dbg_dump_heap(c, heap, cat) ({}) @@ -379,6 +383,8 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum, #define dbg_check_old_index(c, zroot) 0 #define dbg_check_cats(c) 0 #define dbg_check_ltab(c) 0 +#define dbg_chk_lpt_free_spc(c) 0 +#define dbg_chk_lpt_sz(c, action, len) 0 #define dbg_check_synced_i_size(inode) 0 #define dbg_check_dir_size(c, dir) 0 #define dbg_check_tnc(c, x) 0 diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 3d698e2022b1..51cf511d44d9 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -147,6 +147,12 @@ static int do_readpage(struct page *page) err = ret; if (err != -ENOENT) break; + } else if (block + 1 == beyond) { + int dlen = le32_to_cpu(dn->size); + int ilen = i_size & (UBIFS_BLOCK_SIZE - 1); + + if (ilen && ilen < dlen) + memset(addr + ilen, 0, dlen - ilen); } } if (++i >= UBIFS_BLOCKS_PER_PAGE) @@ -577,8 +583,262 @@ out: return copied; } +/** + * populate_page - copy data nodes into a page for bulk-read. + * @c: UBIFS file-system description object + * @page: page + * @bu: bulk-read information + * @n: next zbranch slot + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +static int populate_page(struct ubifs_info *c, struct page *page, + struct bu_info *bu, int *n) +{ + int i = 0, nn = *n, offs = bu->zbranch[0].offs, hole = 0, read = 0; + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + unsigned int page_block; + void *addr, *zaddr; + pgoff_t end_index; + + dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", + inode->i_ino, page->index, i_size, page->flags); + + addr = zaddr = kmap(page); + + end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + if (!i_size || page->index > end_index) { + hole = 1; + memset(addr, 0, PAGE_CACHE_SIZE); + goto out_hole; + } + + page_block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + while (1) { + int err, len, out_len, dlen; + + if (nn >= bu->cnt) { + hole = 1; + memset(addr, 0, UBIFS_BLOCK_SIZE); + } else if (key_block(c, &bu->zbranch[nn].key) == page_block) { + struct ubifs_data_node *dn; + + dn = bu->buf + (bu->zbranch[nn].offs - offs); + + ubifs_assert(dn->ch.sqnum > + ubifs_inode(inode)->creat_sqnum); + + len = le32_to_cpu(dn->size); + if (len <= 0 || len > UBIFS_BLOCK_SIZE) + goto out_err; + + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + out_len = UBIFS_BLOCK_SIZE; + err = ubifs_decompress(&dn->data, dlen, addr, &out_len, + le16_to_cpu(dn->compr_type)); + if (err || len != out_len) + goto out_err; + + if (len < UBIFS_BLOCK_SIZE) + memset(addr + len, 0, UBIFS_BLOCK_SIZE - len); + + nn += 1; + read = (i << UBIFS_BLOCK_SHIFT) + len; + } else if (key_block(c, &bu->zbranch[nn].key) < page_block) { + nn += 1; + continue; + } else { + hole = 1; + memset(addr, 0, UBIFS_BLOCK_SIZE); + } + if (++i >= UBIFS_BLOCKS_PER_PAGE) + break; + addr += UBIFS_BLOCK_SIZE; + page_block += 1; + } + + if (end_index == page->index) { + int len = i_size & (PAGE_CACHE_SIZE - 1); + + if (len && len < read) + memset(zaddr + len, 0, read - len); + } + +out_hole: + if (hole) { + SetPageChecked(page); + dbg_gen("hole"); + } + + SetPageUptodate(page); + ClearPageError(page); + 
flush_dcache_page(page); + kunmap(page); + *n = nn; + return 0; + +out_err: + ClearPageUptodate(page); + SetPageError(page); + flush_dcache_page(page); + kunmap(page); + ubifs_err("bad data node (block %u, inode %lu)", + page_block, inode->i_ino); + return -EINVAL; +} + +/** + * ubifs_do_bulk_read - do bulk-read. + * @c: UBIFS file-system description object + * @page1: first page + * + * This function returns %1 if the bulk-read is done, otherwise %0 is returned. + */ +static int ubifs_do_bulk_read(struct ubifs_info *c, struct page *page1) +{ + pgoff_t offset = page1->index, end_index; + struct address_space *mapping = page1->mapping; + struct inode *inode = mapping->host; + struct ubifs_inode *ui = ubifs_inode(inode); + struct bu_info *bu; + int err, page_idx, page_cnt, ret = 0, n = 0; + loff_t isize; + + bu = kmalloc(sizeof(struct bu_info), GFP_NOFS); + if (!bu) + return 0; + + bu->buf_len = c->bulk_read_buf_size; + bu->buf = kmalloc(bu->buf_len, GFP_NOFS); + if (!bu->buf) + goto out_free; + + data_key_init(c, &bu->key, inode->i_ino, + offset << UBIFS_BLOCKS_PER_PAGE_SHIFT); + + err = ubifs_tnc_get_bu_keys(c, bu); + if (err) + goto out_warn; + + if (bu->eof) { + /* Turn off bulk-read at the end of the file */ + ui->read_in_a_row = 1; + ui->bulk_read = 0; + } + + page_cnt = bu->blk_cnt >> UBIFS_BLOCKS_PER_PAGE_SHIFT; + if (!page_cnt) { + /* + * This happens when there are multiple blocks per page and the + * blocks for the first page we are looking for, are not + * together. If all the pages were like this, bulk-read would + * reduce performance, so we turn it off for a while. 
+ */ + ui->read_in_a_row = 0; + ui->bulk_read = 0; + goto out_free; + } + + if (bu->cnt) { + err = ubifs_tnc_bulk_read(c, bu); + if (err) + goto out_warn; + } + + err = populate_page(c, page1, bu, &n); + if (err) + goto out_warn; + + unlock_page(page1); + ret = 1; + + isize = i_size_read(inode); + if (isize == 0) + goto out_free; + end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); + + for (page_idx = 1; page_idx < page_cnt; page_idx++) { + pgoff_t page_offset = offset + page_idx; + struct page *page; + + if (page_offset > end_index) + break; + page = find_or_create_page(mapping, page_offset, + GFP_NOFS | __GFP_COLD); + if (!page) + break; + if (!PageUptodate(page)) + err = populate_page(c, page, bu, &n); + unlock_page(page); + page_cache_release(page); + if (err) + break; + } + + ui->last_page_read = offset + page_idx - 1; + +out_free: + kfree(bu->buf); + kfree(bu); + return ret; + +out_warn: + ubifs_warn("ignoring error %d and skipping bulk-read", err); + goto out_free; +} + +/** + * ubifs_bulk_read - determine whether to bulk-read and, if so, do it. + * @page: page from which to start bulk-read. + * + * Some flash media are capable of reading sequentially at faster rates. UBIFS + * bulk-read facility is designed to take advantage of that, by reading in one + * go consecutive data nodes that are also located consecutively in the same + * LEB. This function returns %1 if a bulk-read is done and %0 otherwise. + */ +static int ubifs_bulk_read(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + pgoff_t index = page->index, last_page_read = ui->last_page_read; + int ret = 0; + + ui->last_page_read = index; + + if (!c->bulk_read) + return 0; + /* + * Bulk-read is protected by ui_mutex, but it is an optimization, so + * don't bother if we cannot lock the mutex. 
+ */ + if (!mutex_trylock(&ui->ui_mutex)) + return 0; + if (index != last_page_read + 1) { + /* Turn off bulk-read if we stop reading sequentially */ + ui->read_in_a_row = 1; + if (ui->bulk_read) + ui->bulk_read = 0; + goto out_unlock; + } + if (!ui->bulk_read) { + ui->read_in_a_row += 1; + if (ui->read_in_a_row < 3) + goto out_unlock; + /* Three reads in a row, so switch on bulk-read */ + ui->bulk_read = 1; + } + ret = ubifs_do_bulk_read(c, page); +out_unlock: + mutex_unlock(&ui->ui_mutex); + return ret; +} + static int ubifs_readpage(struct file *file, struct page *page) { + if (ubifs_bulk_read(page)) + return 0; do_readpage(page); unlock_page(page); return 0; diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 47814cde2407..717d79c97c5e 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -901,11 +901,11 @@ static int get_idx_gc_leb(struct ubifs_info *c) * it is needed now for this commit. */ lp = ubifs_lpt_lookup_dirty(c, lnum); - if (unlikely(IS_ERR(lp))) + if (IS_ERR(lp)) return PTR_ERR(lp); lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, lp->flags | LPROPS_INDEX, -1); - if (unlikely(IS_ERR(lp))) + if (IS_ERR(lp)) return PTR_ERR(lp); dbg_find("LEB %d, dirty %d and free %d flags %#x", lp->lnum, lp->dirty, lp->free, lp->flags); diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 02aba36fe3d4..0bef6501d58a 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -96,6 +96,48 @@ static int switch_gc_head(struct ubifs_info *c) } /** + * joinup - bring data nodes for an inode together. + * @c: UBIFS file-system description object + * @sleb: describes scanned LEB + * @inum: inode number + * @blk: block number + * @data: list to which to add data nodes + * + * This function looks at the first few nodes in the scanned LEB @sleb and adds + * them to @data if they are data nodes from @inum and have a larger block + * number than @blk. This function returns %0 on success and a negative error + * code on failure. 
+ */ +static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum, + unsigned int blk, struct list_head *data) +{ + int err, cnt = 6, lnum = sleb->lnum, offs; + struct ubifs_scan_node *snod, *tmp; + union ubifs_key *key; + + list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { + key = &snod->key; + if (key_inum(c, key) == inum && + key_type(c, key) == UBIFS_DATA_KEY && + key_block(c, key) > blk) { + offs = snod->offs; + err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0); + if (err < 0) + return err; + list_del(&snod->list); + if (err) { + list_add_tail(&snod->list, data); + blk = key_block(c, key); + } else + kfree(snod); + cnt = 6; + } else if (--cnt == 0) + break; + } + return 0; +} + +/** * move_nodes - move nodes. * @c: UBIFS file-system description object * @sleb: describes nodes to move @@ -116,16 +158,21 @@ static int switch_gc_head(struct ubifs_info *c) static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) { struct ubifs_scan_node *snod, *tmp; - struct list_head large, medium, small; + struct list_head data, large, medium, small; struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; int avail, err, min = INT_MAX; + unsigned int blk = 0; + ino_t inum = 0; + INIT_LIST_HEAD(&data); INIT_LIST_HEAD(&large); INIT_LIST_HEAD(&medium); INIT_LIST_HEAD(&small); - list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { - struct list_head *lst; + while (!list_empty(&sleb->nodes)) { + struct list_head *lst = sleb->nodes.next; + + snod = list_entry(lst, struct ubifs_scan_node, list); ubifs_assert(snod->type != UBIFS_IDX_NODE); ubifs_assert(snod->type != UBIFS_REF_NODE); @@ -136,7 +183,6 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) if (err < 0) goto out; - lst = &snod->list; list_del(lst); if (!err) { /* The node is obsolete, remove it from the list */ @@ -145,15 +191,30 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) } /* - * Sort the list of nodes so that large nodes go 
first, and - * small nodes go last. + * Sort the list of nodes so that data nodes go first, large + * nodes go second, and small nodes go last. */ - if (snod->len > MEDIUM_NODE_WM) - list_add(lst, &large); + if (key_type(c, &snod->key) == UBIFS_DATA_KEY) { + if (inum != key_inum(c, &snod->key)) { + if (inum) { + /* + * Try to move data nodes from the same + * inode together. + */ + err = joinup(c, sleb, inum, blk, &data); + if (err) + goto out; + } + inum = key_inum(c, &snod->key); + blk = key_block(c, &snod->key); + } + list_add_tail(lst, &data); + } else if (snod->len > MEDIUM_NODE_WM) + list_add_tail(lst, &large); else if (snod->len > SMALL_NODE_WM) - list_add(lst, &medium); + list_add_tail(lst, &medium); else - list_add(lst, &small); + list_add_tail(lst, &small); /* And find the smallest node */ if (snod->len < min) @@ -164,6 +225,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) * Join the tree lists so that we'd have one roughly sorted list * ('large' will be the head of the joined list). 
*/ + list_splice(&data, &large); list_splice(&medium, large.prev); list_splice(&small, large.prev); @@ -653,7 +715,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c) */ while (1) { lp = ubifs_fast_find_freeable(c); - if (unlikely(IS_ERR(lp))) { + if (IS_ERR(lp)) { err = PTR_ERR(lp); goto out; } @@ -665,7 +727,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c) if (err) goto out; lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0); - if (unlikely(IS_ERR(lp))) { + if (IS_ERR(lp)) { err = PTR_ERR(lp); goto out; } @@ -680,7 +742,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c) /* Record index freeable LEBs for unmapping after commit */ while (1) { lp = ubifs_fast_find_frdi_idx(c); - if (unlikely(IS_ERR(lp))) { + if (IS_ERR(lp)) { err = PTR_ERR(lp); goto out; } @@ -696,7 +758,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c) /* Don't release the LEB until after the next commit */ flags = (lp->flags | LPROPS_TAKEN) ^ LPROPS_INDEX; lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1); - if (unlikely(IS_ERR(lp))) { + if (IS_ERR(lp)) { err = PTR_ERR(lp); kfree(idx_gc); goto out; diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 054363f2b207..01682713af69 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -62,6 +62,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) { if (!c->ro_media) { c->ro_media = 1; + c->no_chk_data_crc = 0; ubifs_warn("switched to read-only mode, error %d", err); dbg_dump_stack(); } @@ -74,6 +75,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) * @lnum: logical eraseblock number * @offs: offset within the logical eraseblock * @quiet: print no messages + * @chk_crc: indicates whether to always check the CRC * * This function checks node magic number and CRC checksum. This function also * validates node length to prevent UBIFS from becoming crazy when an attacker @@ -85,7 +87,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) * or magic. 
*/ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, - int offs, int quiet) + int offs, int quiet, int chk_crc) { int err = -EINVAL, type, node_len; uint32_t crc, node_crc, magic; @@ -121,6 +123,10 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, node_len > c->ranges[type].max_len) goto out_len; + if (!chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc) + if (c->no_chk_data_crc) + return 0; + crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); node_crc = le32_to_cpu(ch->crc); if (crc != node_crc) { @@ -722,7 +728,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, goto out; } - err = ubifs_check_node(c, buf, lnum, offs, 0); + err = ubifs_check_node(c, buf, lnum, offs, 0, 0); if (err) { ubifs_err("expected node type %d", type); return err; @@ -781,7 +787,7 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, goto out; } - err = ubifs_check_node(c, buf, lnum, offs, 0); + err = ubifs_check_node(c, buf, lnum, offs, 0, 0); if (err) { ubifs_err("expected node type %d", type); return err; diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h index 8f7476007549..9ee65086f627 100644 --- a/fs/ubifs/key.h +++ b/fs/ubifs/key.h @@ -484,7 +484,7 @@ static inline void key_copy(const struct ubifs_info *c, * @key2: the second key to compare * * This function compares 2 keys and returns %-1 if @key1 is less than - * @key2, 0 if the keys are equivalent and %1 if @key1 is greater than @key2. + * @key2, %0 if the keys are equivalent and %1 if @key1 is greater than @key2. */ static inline int keys_cmp(const struct ubifs_info *c, const union ubifs_key *key1, @@ -503,6 +503,26 @@ static inline int keys_cmp(const struct ubifs_info *c, } /** + * keys_eq - determine if keys are equivalent. 
+ * @c: UBIFS file-system description object + * @key1: the first key to compare + * @key2: the second key to compare + * + * This function compares 2 keys and returns %1 if @key1 is equal to @key2 and + * %0 if not. + */ +static inline int keys_eq(const struct ubifs_info *c, + const union ubifs_key *key1, + const union ubifs_key *key2) +{ + if (key1->u32[0] != key2->u32[0]) + return 0; + if (key1->u32[1] != key2->u32[1]) + return 0; + return 1; +} + +/** * is_hash_key - is a key vulnerable to hash collisions. * @c: UBIFS file-system description object * @key: key diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 2ba93da71b65..f27176e9b70d 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -125,6 +125,7 @@ static void adjust_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, } } } + /* Not greater than parent, so compare to children */ while (1) { /* Compare to left child */ @@ -460,18 +461,6 @@ static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops) } /** - * ubifs_get_lprops - get reference to LEB properties. - * @c: the UBIFS file-system description object - * - * This function locks lprops. Lprops have to be unlocked by - * 'ubifs_release_lprops()'. - */ -void ubifs_get_lprops(struct ubifs_info *c) -{ - mutex_lock(&c->lp_mutex); -} - -/** * calc_dark - calculate LEB dark space size. * @c: the UBIFS file-system description object * @spc: amount of free and dirty space in the LEB @@ -576,7 +565,6 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, ubifs_assert(!(lprops->free & 7) && !(lprops->dirty & 7)); spin_lock(&c->space_lock); - if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size) c->lst.taken_empty_lebs -= 1; @@ -637,31 +625,12 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, c->lst.taken_empty_lebs += 1; change_category(c, lprops); - c->idx_gc_cnt += idx_gc_cnt; - spin_unlock(&c->space_lock); - return lprops; } /** - * ubifs_release_lprops - release lprops lock. 
- * @c: the UBIFS file-system description object - * - * This function has to be called after each 'ubifs_get_lprops()' call to - * unlock lprops. - */ -void ubifs_release_lprops(struct ubifs_info *c) -{ - ubifs_assert(mutex_is_locked(&c->lp_mutex)); - ubifs_assert(c->lst.empty_lebs >= 0 && - c->lst.empty_lebs <= c->main_lebs); - - mutex_unlock(&c->lp_mutex); -} - -/** * ubifs_get_lp_stats - get lprops statistics. * @c: UBIFS file-system description object * @st: return statistics @@ -1262,7 +1231,6 @@ static int scan_check_cb(struct ubifs_info *c, } ubifs_scan_destroy(sleb); - return LPT_SCAN_CONTINUE; out_print: diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index 9ff2463177e5..db8bd0e518b2 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -109,7 +109,8 @@ static void do_calc_lpt_geom(struct ubifs_info *c) c->lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; c->lpt_sz += (long long)c->nnode_cnt * c->nnode_sz; c->lpt_sz += c->ltab_sz; - c->lpt_sz += c->lsave_sz; + if (c->big_lpt) + c->lpt_sz += c->lsave_sz; /* Add wastage */ sz = c->lpt_sz; @@ -287,25 +288,56 @@ uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits) const int k = 32 - nrbits; uint8_t *p = *addr; int b = *pos; - uint32_t val; + uint32_t uninitialized_var(val); + const int bytes = (nrbits + b + 7) >> 3; ubifs_assert(nrbits > 0); ubifs_assert(nrbits <= 32); ubifs_assert(*pos >= 0); ubifs_assert(*pos < 8); if (b) { - val = p[1] | ((uint32_t)p[2] << 8) | ((uint32_t)p[3] << 16) | - ((uint32_t)p[4] << 24); + switch (bytes) { + case 2: + val = p[1]; + break; + case 3: + val = p[1] | ((uint32_t)p[2] << 8); + break; + case 4: + val = p[1] | ((uint32_t)p[2] << 8) | + ((uint32_t)p[3] << 16); + break; + case 5: + val = p[1] | ((uint32_t)p[2] << 8) | + ((uint32_t)p[3] << 16) | + ((uint32_t)p[4] << 24); + } val <<= (8 - b); val |= *p >> b; nrbits += b; - } else - val = p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16) | - ((uint32_t)p[3] << 24); + } else { + switch (bytes) { + case 1: + val = p[0]; + 
break; + case 2: + val = p[0] | ((uint32_t)p[1] << 8); + break; + case 3: + val = p[0] | ((uint32_t)p[1] << 8) | + ((uint32_t)p[2] << 16); + break; + case 4: + val = p[0] | ((uint32_t)p[1] << 8) | + ((uint32_t)p[2] << 16) | + ((uint32_t)p[3] << 24); + break; + } + } val <<= k; val >>= k; b = nrbits & 7; - p += nrbits / 8; + p += nrbits >> 3; *addr = p; *pos = b; ubifs_assert((val >> nrbits) == 0 || nrbits - b == 32); diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 5f0b83e20af6..eed5a0025d63 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -177,8 +177,6 @@ static int alloc_lpt_leb(struct ubifs_info *c, int *lnum) return 0; } } - dbg_err("last LEB %d", *lnum); - dump_stack(); return -ENOSPC; } @@ -193,6 +191,9 @@ static int layout_cnodes(struct ubifs_info *c) int lnum, offs, len, alen, done_lsave, done_ltab, err; struct ubifs_cnode *cnode; + err = dbg_chk_lpt_sz(c, 0, 0); + if (err) + return err; cnode = c->lpt_cnext; if (!cnode) return 0; @@ -206,6 +207,7 @@ static int layout_cnodes(struct ubifs_info *c) c->lsave_lnum = lnum; c->lsave_offs = offs; offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); } if (offs + c->ltab_sz <= c->leb_size) { @@ -213,6 +215,7 @@ static int layout_cnodes(struct ubifs_info *c) c->ltab_lnum = lnum; c->ltab_offs = offs; offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); } do { @@ -226,9 +229,10 @@ static int layout_cnodes(struct ubifs_info *c) while (offs + len > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 2, alen - offs); err = alloc_lpt_leb(c, &lnum); if (err) - return err; + goto no_space; offs = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); @@ -238,6 +242,7 @@ static int layout_cnodes(struct ubifs_info *c) c->lsave_lnum = lnum; c->lsave_offs = offs; offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); continue; } if (!done_ltab) { @@ -245,6 +250,7 @@ static int layout_cnodes(struct ubifs_info 
*c) c->ltab_lnum = lnum; c->ltab_offs = offs; offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); continue; } break; @@ -257,6 +263,7 @@ static int layout_cnodes(struct ubifs_info *c) c->lpt_offs = offs; } offs += len; + dbg_chk_lpt_sz(c, 1, len); cnode = cnode->cnext; } while (cnode && cnode != c->lpt_cnext); @@ -265,9 +272,10 @@ static int layout_cnodes(struct ubifs_info *c) if (offs + c->lsave_sz > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 2, alen - offs); err = alloc_lpt_leb(c, &lnum); if (err) - return err; + goto no_space; offs = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); @@ -276,6 +284,7 @@ static int layout_cnodes(struct ubifs_info *c) c->lsave_lnum = lnum; c->lsave_offs = offs; offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); } /* Make sure to place LPT's own lprops table */ @@ -283,9 +292,10 @@ static int layout_cnodes(struct ubifs_info *c) if (offs + c->ltab_sz > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 2, alen - offs); err = alloc_lpt_leb(c, &lnum); if (err) - return err; + goto no_space; offs = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); @@ -294,11 +304,23 @@ static int layout_cnodes(struct ubifs_info *c) c->ltab_lnum = lnum; c->ltab_offs = offs; offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); } alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 4, alen - offs); + err = dbg_chk_lpt_sz(c, 3, alen); + if (err) + return err; return 0; + +no_space: + ubifs_err("LPT out of space"); + dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, " + "done_lsave %d", lnum, offs, len, done_ltab, done_lsave); + dbg_dump_lpt_info(c); + return err; } /** @@ -333,8 +355,6 @@ static int realloc_lpt_leb(struct ubifs_info *c, int *lnum) *lnum = i + c->lpt_first; return 0; } - dbg_err("last 
LEB %d", *lnum); - dump_stack(); return -ENOSPC; } @@ -369,12 +389,14 @@ static int write_cnodes(struct ubifs_info *c) done_lsave = 1; ubifs_pack_lsave(c, buf + offs, c->lsave); offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); } if (offs + c->ltab_sz <= c->leb_size) { done_ltab = 1; ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); } /* Loop for each cnode */ @@ -392,10 +414,12 @@ static int write_cnodes(struct ubifs_info *c) alen, UBI_SHORTTERM); if (err) return err; + dbg_chk_lpt_sz(c, 4, alen - wlen); } + dbg_chk_lpt_sz(c, 2, 0); err = realloc_lpt_leb(c, &lnum); if (err) - return err; + goto no_space; offs = 0; from = 0; ubifs_assert(lnum >= c->lpt_first && @@ -408,12 +432,14 @@ static int write_cnodes(struct ubifs_info *c) done_lsave = 1; ubifs_pack_lsave(c, buf + offs, c->lsave); offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); continue; } if (!done_ltab) { done_ltab = 1; ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); continue; } break; @@ -435,6 +461,7 @@ static int write_cnodes(struct ubifs_info *c) clear_bit(COW_ZNODE, &cnode->flags); smp_mb__after_clear_bit(); offs += len; + dbg_chk_lpt_sz(c, 1, len); cnode = cnode->cnext; } while (cnode && cnode != c->lpt_cnext); @@ -448,9 +475,10 @@ static int write_cnodes(struct ubifs_info *c) UBI_SHORTTERM); if (err) return err; + dbg_chk_lpt_sz(c, 2, alen - wlen); err = realloc_lpt_leb(c, &lnum); if (err) - return err; + goto no_space; offs = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); @@ -461,6 +489,7 @@ static int write_cnodes(struct ubifs_info *c) done_lsave = 1; ubifs_pack_lsave(c, buf + offs, c->lsave); offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); } /* Make sure to place LPT's own lprops table */ @@ -473,9 +502,10 @@ static int write_cnodes(struct ubifs_info *c) UBI_SHORTTERM); if (err) return err; + dbg_chk_lpt_sz(c, 2, alen - wlen); err = realloc_lpt_leb(c, 
&lnum); if (err) - return err; + goto no_space; offs = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); @@ -486,6 +516,7 @@ static int write_cnodes(struct ubifs_info *c) done_ltab = 1; ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); } /* Write remaining data in buffer */ @@ -495,6 +526,12 @@ static int write_cnodes(struct ubifs_info *c) err = ubifs_leb_write(c, lnum, buf + from, from, alen, UBI_SHORTTERM); if (err) return err; + + dbg_chk_lpt_sz(c, 4, alen - wlen); + err = dbg_chk_lpt_sz(c, 3, ALIGN(offs, c->min_io_size)); + if (err) + return err; + c->nhead_lnum = lnum; c->nhead_offs = ALIGN(offs, c->min_io_size); @@ -503,7 +540,15 @@ static int write_cnodes(struct ubifs_info *c) dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs); if (c->big_lpt) dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs); + return 0; + +no_space: + ubifs_err("LPT out of space mismatch"); + dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab " + "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave); + dbg_dump_lpt_info(c); + return err; } /** @@ -1044,6 +1089,8 @@ static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len) int pos = 0, node_type, node_len; uint16_t crc, calc_crc; + if (len < UBIFS_LPT_CRC_BYTES + (UBIFS_LPT_TYPE_BITS + 7) / 8) + return 0; node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS); if (node_type == UBIFS_LPT_NOT_A_NODE) return 0; @@ -1156,6 +1203,9 @@ int ubifs_lpt_start_commit(struct ubifs_info *c) dbg_lp(""); mutex_lock(&c->lp_mutex); + err = dbg_chk_lpt_free_spc(c); + if (err) + goto out; err = dbg_check_ltab(c); if (err) goto out; @@ -1645,4 +1695,121 @@ int dbg_check_ltab(struct ubifs_info *c) return 0; } +/** + * dbg_chk_lpt_free_spc - check LPT free space is enough to write entire LPT. + * @c: the UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +int dbg_chk_lpt_free_spc(struct ubifs_info *c) +{ + long long free = 0; + int i; + + for (i = 0; i < c->lpt_lebs; i++) { + if (c->ltab[i].tgc || c->ltab[i].cmt) + continue; + if (i + c->lpt_first == c->nhead_lnum) + free += c->leb_size - c->nhead_offs; + else if (c->ltab[i].free == c->leb_size) + free += c->leb_size; + } + if (free < c->lpt_sz) { + dbg_err("LPT space error: free %lld lpt_sz %lld", + free, c->lpt_sz); + dbg_dump_lpt_info(c); + return -EINVAL; + } + return 0; +} + +/** + * dbg_chk_lpt_sz - check LPT does not write more than LPT size. + * @c: the UBIFS file-system description object + * @action: action + * @len: length written + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) +{ + long long chk_lpt_sz, lpt_sz; + int err = 0; + + switch (action) { + case 0: + c->chk_lpt_sz = 0; + c->chk_lpt_sz2 = 0; + c->chk_lpt_lebs = 0; + c->chk_lpt_wastage = 0; + if (c->dirty_pn_cnt > c->pnode_cnt) { + dbg_err("dirty pnodes %d exceed max %d", + c->dirty_pn_cnt, c->pnode_cnt); + err = -EINVAL; + } + if (c->dirty_nn_cnt > c->nnode_cnt) { + dbg_err("dirty nnodes %d exceed max %d", + c->dirty_nn_cnt, c->nnode_cnt); + err = -EINVAL; + } + return err; + case 1: + c->chk_lpt_sz += len; + return 0; + case 2: + c->chk_lpt_sz += len; + c->chk_lpt_wastage += len; + c->chk_lpt_lebs += 1; + return 0; + case 3: + chk_lpt_sz = c->leb_size; + chk_lpt_sz *= c->chk_lpt_lebs; + chk_lpt_sz += len - c->nhead_offs; + if (c->chk_lpt_sz != chk_lpt_sz) { + dbg_err("LPT wrote %lld but space used was %lld", + c->chk_lpt_sz, chk_lpt_sz); + err = -EINVAL; + } + if (c->chk_lpt_sz > c->lpt_sz) { + dbg_err("LPT wrote %lld but lpt_sz is %lld", + c->chk_lpt_sz, c->lpt_sz); + err = -EINVAL; + } + if (c->chk_lpt_sz2 && c->chk_lpt_sz != c->chk_lpt_sz2) { + dbg_err("LPT layout size %lld but wrote %lld", + c->chk_lpt_sz, c->chk_lpt_sz2); + err = -EINVAL; + } + if (c->chk_lpt_sz2 && c->new_nhead_offs 
!= len) { + dbg_err("LPT new nhead offs: expected %d was %d", + c->new_nhead_offs, len); + err = -EINVAL; + } + lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; + lpt_sz += (long long)c->nnode_cnt * c->nnode_sz; + lpt_sz += c->ltab_sz; + if (c->big_lpt) + lpt_sz += c->lsave_sz; + if (c->chk_lpt_sz - c->chk_lpt_wastage > lpt_sz) { + dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld", + c->chk_lpt_sz, c->chk_lpt_wastage, lpt_sz); + err = -EINVAL; + } + if (err) + dbg_dump_lpt_info(c); + c->chk_lpt_sz2 = c->chk_lpt_sz; + c->chk_lpt_sz = 0; + c->chk_lpt_wastage = 0; + c->chk_lpt_lebs = 0; + c->new_nhead_offs = len; + return err; + case 4: + c->chk_lpt_sz += len; + c->chk_lpt_wastage += len; + return 0; + default: + return -EINVAL; + } +} + #endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index 4c12a9215d7f..4fa81d867e41 100644 --- a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h @@ -310,4 +310,31 @@ static inline int ubifs_tnc_lookup(struct ubifs_info *c, return ubifs_tnc_locate(c, key, node, NULL, NULL); } +/** + * ubifs_get_lprops - get reference to LEB properties. + * @c: the UBIFS file-system description object + * + * This function locks lprops. Lprops have to be unlocked by + * 'ubifs_release_lprops()'. + */ +static inline void ubifs_get_lprops(struct ubifs_info *c) +{ + mutex_lock(&c->lp_mutex); +} + +/** + * ubifs_release_lprops - release lprops lock. + * @c: the UBIFS file-system description object + * + * This function has to be called after each 'ubifs_get_lprops()' call to + * unlock lprops. 
+ */ +static inline void ubifs_release_lprops(struct ubifs_info *c) +{ + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + ubifs_assert(c->lst.empty_lebs >= 0 && + c->lst.empty_lebs <= c->main_lebs); + mutex_unlock(&c->lp_mutex); +} + #endif /* __UBIFS_MISC_H__ */ diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index acf5c5fffc60..0ed82479b44b 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -87,7 +87,7 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, dbg_scan("scanning %s", dbg_ntype(ch->node_type)); - if (ubifs_check_node(c, buf, lnum, offs, quiet)) + if (ubifs_check_node(c, buf, lnum, offs, quiet, 1)) return SCANNED_A_CORRUPT_NODE; if (ch->node_type == UBIFS_PAD_NODE) { diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 9a9220333b3b..8780efbf40ac 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -401,6 +401,16 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt) else if (c->mount_opts.unmount_mode == 1) seq_printf(s, ",norm_unmount"); + if (c->mount_opts.bulk_read == 2) + seq_printf(s, ",bulk_read"); + else if (c->mount_opts.bulk_read == 1) + seq_printf(s, ",no_bulk_read"); + + if (c->mount_opts.chk_data_crc == 2) + seq_printf(s, ",chk_data_crc"); + else if (c->mount_opts.chk_data_crc == 1) + seq_printf(s, ",no_chk_data_crc"); + return 0; } @@ -408,13 +418,26 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) { struct ubifs_info *c = sb->s_fs_info; int i, ret = 0, err; + long long bud_bytes; - if (c->jheads) + if (c->jheads) { for (i = 0; i < c->jhead_cnt; i++) { err = ubifs_wbuf_sync(&c->jheads[i].wbuf); if (err && !ret) ret = err; } + + /* Commit the journal unless it has too little data */ + spin_lock(&c->buds_lock); + bud_bytes = c->bud_bytes; + spin_unlock(&c->buds_lock); + if (bud_bytes > c->leb_size) { + err = ubifs_run_commit(c); + if (err) + return err; + } + } + /* * We ought to call sync for c->ubi but it does not have one. 
If it had * it would in turn call mtd->sync, however mtd operations are @@ -538,6 +561,18 @@ static int init_constants_early(struct ubifs_info *c) * calculations when reporting free space. */ c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ; + /* Buffer size for bulk-reads */ + c->bulk_read_buf_size = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ; + if (c->bulk_read_buf_size > c->leb_size) + c->bulk_read_buf_size = c->leb_size; + if (c->bulk_read_buf_size > 128 * 1024) { + /* Check if we can kmalloc more than 128KiB */ + void *try = kmalloc(c->bulk_read_buf_size, GFP_KERNEL); + + kfree(try); + if (!try) + c->bulk_read_buf_size = 128 * 1024; + } return 0; } @@ -840,17 +875,29 @@ static int check_volume_empty(struct ubifs_info *c) * * Opt_fast_unmount: do not run a journal commit before un-mounting * Opt_norm_unmount: run a journal commit before un-mounting + * Opt_bulk_read: enable bulk-reads + * Opt_no_bulk_read: disable bulk-reads + * Opt_chk_data_crc: check CRCs when reading data nodes + * Opt_no_chk_data_crc: do not check CRCs when reading data nodes * Opt_err: just end of array marker */ enum { Opt_fast_unmount, Opt_norm_unmount, + Opt_bulk_read, + Opt_no_bulk_read, + Opt_chk_data_crc, + Opt_no_chk_data_crc, Opt_err, }; static const match_table_t tokens = { {Opt_fast_unmount, "fast_unmount"}, {Opt_norm_unmount, "norm_unmount"}, + {Opt_bulk_read, "bulk_read"}, + {Opt_no_bulk_read, "no_bulk_read"}, + {Opt_chk_data_crc, "chk_data_crc"}, + {Opt_no_chk_data_crc, "no_chk_data_crc"}, {Opt_err, NULL}, }; @@ -888,6 +935,22 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options, c->mount_opts.unmount_mode = 1; c->fast_unmount = 0; break; + case Opt_bulk_read: + c->mount_opts.bulk_read = 2; + c->bulk_read = 1; + break; + case Opt_no_bulk_read: + c->mount_opts.bulk_read = 1; + c->bulk_read = 0; + break; + case Opt_chk_data_crc: + c->mount_opts.chk_data_crc = 2; + c->no_chk_data_crc = 0; + break; + case Opt_no_chk_data_crc: + c->mount_opts.chk_data_crc = 
1; + c->no_chk_data_crc = 1; + break; default: ubifs_err("unrecognized mount option \"%s\" " "or missing value", p); @@ -996,6 +1059,8 @@ static int mount_ubifs(struct ubifs_info *c) goto out_free; } + c->always_chk_crc = 1; + err = ubifs_read_superblock(c); if (err) goto out_free; @@ -1032,8 +1097,6 @@ static int mount_ubifs(struct ubifs_info *c) /* Create background thread */ c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); - if (!c->bgt) - c->bgt = ERR_PTR(-EINVAL); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; @@ -1139,24 +1202,28 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_infos; + c->always_chk_crc = 0; + ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", c->vi.ubi_num, c->vi.vol_id, c->vi.name); if (mounted_read_only) ubifs_msg("mounted read-only"); x = (long long)c->main_lebs * c->leb_size; - ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)", - x, x >> 10, x >> 20, c->main_lebs); + ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d " + "LEBs)", x, x >> 10, x >> 20, c->main_lebs); x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; - ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)", - x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); - ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); - ubifs_msg("media format %d, latest format %d", + ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " + "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); + ubifs_msg("media format: %d (latest is %d)", c->fmt_version, UBIFS_FORMAT_VERSION); + ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); + ubifs_msg("reserved for root: %llu bytes (%llu KiB)", + c->report_rp_size, c->report_rp_size >> 10); dbg_msg("compiled on: " __DATE__ " at " __TIME__); dbg_msg("min. 
I/O unit size: %d bytes", c->min_io_size); dbg_msg("LEB size: %d bytes (%d KiB)", - c->leb_size, c->leb_size / 1024); + c->leb_size, c->leb_size >> 10); dbg_msg("data journal heads: %d", c->jhead_cnt - NONDATA_JHEADS_CNT); dbg_msg("UUID: %02X%02X%02X%02X-%02X%02X" @@ -1282,6 +1349,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) mutex_lock(&c->umount_mutex); c->remounting_rw = 1; + c->always_chk_crc = 1; /* Check for enough free space */ if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) { @@ -1345,20 +1413,20 @@ static int ubifs_remount_rw(struct ubifs_info *c) /* Create background thread */ c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); - if (!c->bgt) - c->bgt = ERR_PTR(-EINVAL); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; ubifs_err("cannot spawn \"%s\", error %d", c->bgt_name, err); - return err; + goto out; } wake_up_process(c->bgt); c->orph_buf = vmalloc(c->leb_size); - if (!c->orph_buf) - return -ENOMEM; + if (!c->orph_buf) { + err = -ENOMEM; + goto out; + } /* Check for enough log space */ lnum = c->lhead_lnum + 1; @@ -1385,6 +1453,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) dbg_gen("re-mounted read-write"); c->vfs_sb->s_flags &= ~MS_RDONLY; c->remounting_rw = 0; + c->always_chk_crc = 0; mutex_unlock(&c->umount_mutex); return 0; @@ -1400,6 +1469,7 @@ out: c->ileb_buf = NULL; ubifs_lpt_free(c, 1); c->remounting_rw = 0; + c->always_chk_crc = 0; mutex_unlock(&c->umount_mutex); return err; } @@ -1408,12 +1478,9 @@ out: * commit_on_unmount - commit the journal when un-mounting. * @c: UBIFS file-system description object * - * This function is called during un-mounting and it commits the journal unless - * the "fast unmount" mode is enabled. It also avoids committing the journal if - * it contains too few data. - * - * Sometimes recovery requires the journal to be committed at least once, and - * this function takes care about this. 
+ * This function is called during un-mounting and re-mounting, and it commits + * the journal unless the "fast unmount" mode is enabled. It also avoids + * committing the journal if it contains too few data. */ static void commit_on_unmount(struct ubifs_info *c) { diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 7634c5970887..d27fd918b9c9 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -284,7 +284,7 @@ static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c, } zn = copy_znode(c, znode); - if (unlikely(IS_ERR(zn))) + if (IS_ERR(zn)) return zn; if (zbr->len) { @@ -470,6 +470,10 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type, if (node_len != len) return 0; + if (type == UBIFS_DATA_NODE && !c->always_chk_crc) + if (c->no_chk_data_crc) + return 0; + crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); node_crc = le32_to_cpu(ch->crc); if (crc != node_crc) @@ -1128,7 +1132,7 @@ static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c, ubifs_assert(znode == c->zroot.znode); znode = dirty_cow_znode(c, &c->zroot); } - if (unlikely(IS_ERR(znode)) || !p) + if (IS_ERR(znode) || !p) break; ubifs_assert(path[p - 1] >= 0); ubifs_assert(path[p - 1] < znode->child_cnt); @@ -1492,6 +1496,289 @@ out: } /** + * ubifs_tnc_get_bu_keys - lookup keys for bulk-read. + * @c: UBIFS file-system description object + * @bu: bulk-read parameters and results + * + * Lookup consecutive data node keys for the same inode that reside + * consecutively in the same LEB. 
+ */ +int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu) +{ + int n, err = 0, lnum = -1, uninitialized_var(offs); + int uninitialized_var(len); + unsigned int block = key_block(c, &bu->key); + struct ubifs_znode *znode; + + bu->cnt = 0; + bu->blk_cnt = 0; + bu->eof = 0; + + mutex_lock(&c->tnc_mutex); + /* Find first key */ + err = ubifs_lookup_level0(c, &bu->key, &znode, &n); + if (err < 0) + goto out; + if (err) { + /* Key found */ + len = znode->zbranch[n].len; + /* The buffer must be big enough for at least 1 node */ + if (len > bu->buf_len) { + err = -EINVAL; + goto out; + } + /* Add this key */ + bu->zbranch[bu->cnt++] = znode->zbranch[n]; + bu->blk_cnt += 1; + lnum = znode->zbranch[n].lnum; + offs = ALIGN(znode->zbranch[n].offs + len, 8); + } + while (1) { + struct ubifs_zbranch *zbr; + union ubifs_key *key; + unsigned int next_block; + + /* Find next key */ + err = tnc_next(c, &znode, &n); + if (err) + goto out; + zbr = &znode->zbranch[n]; + key = &zbr->key; + /* See if there is another data key for this file */ + if (key_inum(c, key) != key_inum(c, &bu->key) || + key_type(c, key) != UBIFS_DATA_KEY) { + err = -ENOENT; + goto out; + } + if (lnum < 0) { + /* First key found */ + lnum = zbr->lnum; + offs = ALIGN(zbr->offs + zbr->len, 8); + len = zbr->len; + if (len > bu->buf_len) { + err = -EINVAL; + goto out; + } + } else { + /* + * The data nodes must be in consecutive positions in + * the same LEB. 
+ */ + if (zbr->lnum != lnum || zbr->offs != offs) + goto out; + offs += ALIGN(zbr->len, 8); + len = ALIGN(len, 8) + zbr->len; + /* Must not exceed buffer length */ + if (len > bu->buf_len) + goto out; + } + /* Allow for holes */ + next_block = key_block(c, key); + bu->blk_cnt += (next_block - block - 1); + if (bu->blk_cnt >= UBIFS_MAX_BULK_READ) + goto out; + block = next_block; + /* Add this key */ + bu->zbranch[bu->cnt++] = *zbr; + bu->blk_cnt += 1; + /* See if we have room for more */ + if (bu->cnt >= UBIFS_MAX_BULK_READ) + goto out; + if (bu->blk_cnt >= UBIFS_MAX_BULK_READ) + goto out; + } +out: + if (err == -ENOENT) { + bu->eof = 1; + err = 0; + } + bu->gc_seq = c->gc_seq; + mutex_unlock(&c->tnc_mutex); + if (err) + return err; + /* + * An enormous hole could cause bulk-read to encompass too many + * page cache pages, so limit the number here. + */ + if (bu->blk_cnt > UBIFS_MAX_BULK_READ) + bu->blk_cnt = UBIFS_MAX_BULK_READ; + /* + * Ensure that bulk-read covers a whole number of page cache + * pages. + */ + if (UBIFS_BLOCKS_PER_PAGE == 1 || + !(bu->blk_cnt & (UBIFS_BLOCKS_PER_PAGE - 1))) + return 0; + if (bu->eof) { + /* At the end of file we can round up */ + bu->blk_cnt += UBIFS_BLOCKS_PER_PAGE - 1; + return 0; + } + /* Exclude data nodes that do not make up a whole page cache page */ + block = key_block(c, &bu->key) + bu->blk_cnt; + block &= ~(UBIFS_BLOCKS_PER_PAGE - 1); + while (bu->cnt) { + if (key_block(c, &bu->zbranch[bu->cnt - 1].key) < block) + break; + bu->cnt -= 1; + } + return 0; +} + +/** + * read_wbuf - bulk-read from a LEB with a wbuf. + * @wbuf: wbuf that may overlap the read + * @buf: buffer into which to read + * @len: read length + * @lnum: LEB number from which to read + * @offs: offset from which to read + * + * This function returns %0 on success or a negative error code on failure.
+ */ +static int read_wbuf(struct ubifs_wbuf *wbuf, void *buf, int len, int lnum, + int offs) +{ + const struct ubifs_info *c = wbuf->c; + int rlen, overlap; + + dbg_io("LEB %d:%d, length %d", lnum, offs, len); + ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + ubifs_assert(offs + len <= c->leb_size); + + spin_lock(&wbuf->lock); + overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs); + if (!overlap) { + /* We may safely unlock the write-buffer and read the data */ + spin_unlock(&wbuf->lock); + return ubi_read(c->ubi, lnum, buf, offs, len); + } + + /* Don't read under wbuf */ + rlen = wbuf->offs - offs; + if (rlen < 0) + rlen = 0; + + /* Copy the rest from the write-buffer */ + memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen); + spin_unlock(&wbuf->lock); + + if (rlen > 0) + /* Read everything that goes before write-buffer */ + return ubi_read(c->ubi, lnum, buf, offs, rlen); + + return 0; +} + +/** + * validate_data_node - validate data nodes for bulk-read. + * @c: UBIFS file-system description object + * @buf: buffer containing data node to validate + * @zbr: zbranch of data node to validate + * + * This function returns %0 on success or a negative error code on failure.
+ */ +static int validate_data_node(struct ubifs_info *c, void *buf, + struct ubifs_zbranch *zbr) +{ + union ubifs_key key1; + struct ubifs_ch *ch = buf; + int err, len; + + if (ch->node_type != UBIFS_DATA_NODE) { + ubifs_err("bad node type (%d but expected %d)", + ch->node_type, UBIFS_DATA_NODE); + goto out_err; + } + + err = ubifs_check_node(c, buf, zbr->lnum, zbr->offs, 0, 0); + if (err) { + ubifs_err("expected node type %d", UBIFS_DATA_NODE); + goto out; + } + + len = le32_to_cpu(ch->len); + if (len != zbr->len) { + ubifs_err("bad node length %d, expected %d", len, zbr->len); + goto out_err; + } + + /* Make sure the key of the read node is correct */ + key_read(c, buf + UBIFS_KEY_OFFSET, &key1); + if (!keys_eq(c, &zbr->key, &key1)) { + ubifs_err("bad key in node at LEB %d:%d", + zbr->lnum, zbr->offs); + dbg_tnc("looked for key %s found node's key %s", + DBGKEY(&zbr->key), DBGKEY1(&key1)); + goto out_err; + } + + return 0; + +out_err: + err = -EINVAL; +out: + ubifs_err("bad node at LEB %d:%d", zbr->lnum, zbr->offs); + dbg_dump_node(c, buf); + dbg_dump_stack(); + return err; +} + +/** + * ubifs_tnc_bulk_read - read a number of data nodes in one go. + * @c: UBIFS file-system description object + * @bu: bulk-read parameters and results + * + * This function reads and validates the data nodes that were identified by the + * 'ubifs_tnc_get_bu_keys()' function. This function returns %0 on success, + * -EAGAIN to indicate a race with GC, or another negative error code on + * failure.
+ */ +int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu) +{ + int lnum = bu->zbranch[0].lnum, offs = bu->zbranch[0].offs, len, err, i; + struct ubifs_wbuf *wbuf; + void *buf; + + len = bu->zbranch[bu->cnt - 1].offs; + len += bu->zbranch[bu->cnt - 1].len - offs; + if (len > bu->buf_len) { + ubifs_err("buffer too small %d vs %d", bu->buf_len, len); + return -EINVAL; + } + + /* Do the read */ + wbuf = ubifs_get_wbuf(c, lnum); + if (wbuf) + err = read_wbuf(wbuf, bu->buf, len, lnum, offs); + else + err = ubi_read(c->ubi, lnum, bu->buf, offs, len); + + /* Check for a race with GC */ + if (maybe_leb_gced(c, lnum, bu->gc_seq)) + return -EAGAIN; + + if (err && err != -EBADMSG) { + ubifs_err("failed to read from LEB %d:%d, error %d", + lnum, offs, err); + dbg_dump_stack(); + dbg_tnc("key %s", DBGKEY(&bu->key)); + return err; + } + + /* Validate the nodes read */ + buf = bu->buf; + for (i = 0; i < bu->cnt; i++) { + err = validate_data_node(c, buf, &bu->zbranch[i]); + if (err) + return err; + buf = buf + ALIGN(bu->zbranch[i].len, 8); + } + + return 0; +} + +/** * do_lookup_nm- look up a "hashed" node. * @c: UBIFS file-system description object * @key: node key to lookup @@ -1675,7 +1962,7 @@ static int tnc_insert(struct ubifs_info *c, struct ubifs_znode *znode, { struct ubifs_znode *zn, *zi, *zp; int i, keep, move, appending = 0; - union ubifs_key *key = &zbr->key; + union ubifs_key *key = &zbr->key, *key1; ubifs_assert(n >= 0 && n <= c->fanout); @@ -1716,20 +2003,33 @@ again: zn->level = znode->level; /* Decide where to split */ - if (znode->level == 0 && n == c->fanout && - key_type(c, key) == UBIFS_DATA_KEY) { - union ubifs_key *key1; - - /* - * If this is an inode which is being appended - do not split - * it because no other zbranches can be inserted between - * zbranches of consecutive data nodes anyway. 
- */ - key1 = &znode->zbranch[n - 1].key; - if (key_inum(c, key1) == key_inum(c, key) && - key_type(c, key1) == UBIFS_DATA_KEY && - key_block(c, key1) == key_block(c, key) - 1) - appending = 1; + if (znode->level == 0 && key_type(c, key) == UBIFS_DATA_KEY) { + /* Try not to split consecutive data keys */ + if (n == c->fanout) { + key1 = &znode->zbranch[n - 1].key; + if (key_inum(c, key1) == key_inum(c, key) && + key_type(c, key1) == UBIFS_DATA_KEY) + appending = 1; + } else + goto check_split; + } else if (appending && n != c->fanout) { + /* Try not to split consecutive data keys */ + appending = 0; +check_split: + if (n >= (c->fanout + 1) / 2) { + key1 = &znode->zbranch[0].key; + if (key_inum(c, key1) == key_inum(c, key) && + key_type(c, key1) == UBIFS_DATA_KEY) { + key1 = &znode->zbranch[n].key; + if (key_inum(c, key1) != key_inum(c, key) || + key_type(c, key1) != UBIFS_DATA_KEY) { + keep = n; + move = c->fanout - keep; + zi = znode; + goto do_split; + } + } + } } if (appending) { @@ -1759,6 +2059,8 @@ again: zbr->znode->parent = zn; } +do_split: + __set_bit(DIRTY_ZNODE, &zn->flags); atomic_long_inc(&c->dirty_zn_cnt); @@ -1785,14 +2087,11 @@ again: /* Insert new znode (produced by spitting) into the parent */ if (zp) { - i = n; + if (n == 0 && zi == znode && znode->iip == 0) + correct_parent_keys(c, znode); + /* Locate insertion point */ n = znode->iip + 1; - if (appending && n != c->fanout) - appending = 0; - - if (i == 0 && zi == znode && znode->iip == 0) - correct_parent_keys(c, znode); /* Tail recursion */ zbr->key = zn->zbranch[0].key; diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c index a25c1cc1f8d9..b48db999903e 100644 --- a/fs/ubifs/tnc_misc.c +++ b/fs/ubifs/tnc_misc.c @@ -480,8 +480,8 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, } /* Make sure the key of the read node is correct */ - key_read(c, key, &key1); - if (memcmp(node + UBIFS_KEY_OFFSET, &key1, c->key_len)) { + key_read(c, node + UBIFS_KEY_OFFSET, &key1); + 
if (!keys_eq(c, key, &key1)) { ubifs_err("bad key in node at LEB %d:%d", zbr->lnum, zbr->offs); dbg_tnc("looked for key %s found node's key %s", diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index a9ecbd9af20d..0b378042a3a2 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -75,7 +75,6 @@ */ #define UBIFS_BLOCK_SIZE 4096 #define UBIFS_BLOCK_SHIFT 12 -#define UBIFS_BLOCK_MASK 0x00000FFF /* UBIFS padding byte pattern (must not be first or last byte of node magic) */ #define UBIFS_PADDING_BYTE 0xCE diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 17c620b93eec..a7bd32fa15b9 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -142,6 +142,9 @@ /* Maximum expected tree height for use by bottom_up_buf */ #define BOTTOM_UP_HEIGHT 64 +/* Maximum number of data nodes to bulk-read */ +#define UBIFS_MAX_BULK_READ 32 + /* * Lockdep classes for UBIFS inode @ui_mutex. */ @@ -328,9 +331,10 @@ struct ubifs_gced_idx_leb { * this inode * @dirty: non-zero if the inode is dirty * @xattr: non-zero if this is an extended attribute inode + * @bulk_read: non-zero if bulk-read should be used * @ui_mutex: serializes inode write-back with the rest of VFS operations, - * serializes "clean <-> dirty" state changes, protects @dirty, - * @ui_size, and @xattr_size + * serializes "clean <-> dirty" state changes, serializes bulk-read, + * protects @dirty, @bulk_read, @ui_size, and @xattr_size * @ui_lock: protects @synced_i_size * @synced_i_size: synchronized size of inode, i.e. 
the value of inode size * currently stored on the flash; used only for regular file @@ -338,6 +342,8 @@ struct ubifs_gced_idx_leb { * @ui_size: inode size used by UBIFS when writing to flash * @flags: inode flags (@UBIFS_COMPR_FL, etc) * @compr_type: default compression type used for this inode + * @last_page_read: page number of last page read (for bulk read) + * @read_in_a_row: number of consecutive pages read in a row (for bulk read) * @data_len: length of the data attached to the inode * @data: inode's data * @@ -379,12 +385,15 @@ struct ubifs_inode { unsigned int xattr_names; unsigned int dirty:1; unsigned int xattr:1; + unsigned int bulk_read:1; struct mutex ui_mutex; spinlock_t ui_lock; loff_t synced_i_size; loff_t ui_size; int flags; int compr_type; + pgoff_t last_page_read; + pgoff_t read_in_a_row; int data_len; void *data; }; @@ -698,8 +707,8 @@ struct ubifs_jhead { * struct ubifs_zbranch - key/coordinate/length branch stored in znodes. * @key: key * @znode: znode address in memory - * @lnum: LEB number of the indexing node - * @offs: offset of the indexing node within @lnum + * @lnum: LEB number of the target node (indexing node or data node) + * @offs: target node offset within @lnum * @len: target node length */ struct ubifs_zbranch { @@ -744,6 +753,28 @@ struct ubifs_znode { }; /** + * struct bu_info - bulk-read information + * @key: first data node key + * @zbranch: zbranches of data nodes to bulk read + * @buf: buffer to read into + * @buf_len: buffer length + * @gc_seq: GC sequence number to detect races with GC + * @cnt: number of data nodes for bulk read + * @blk_cnt: number of data blocks including holes + * @eof: end of file reached + */ +struct bu_info { + union ubifs_key key; + struct ubifs_zbranch zbranch[UBIFS_MAX_BULK_READ]; + void *buf; + int buf_len; + int gc_seq; + int cnt; + int blk_cnt; + int eof; +}; + +/** * struct ubifs_node_range - node length range description data structure.
* @len: fixed node length * @min_len: minimum possible node length @@ -862,9 +893,13 @@ struct ubifs_orphan { /** * struct ubifs_mount_opts - UBIFS-specific mount options information. * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast) + * @bulk_read: enable bulk-reads + * @chk_data_crc: check CRCs when reading data nodes */ struct ubifs_mount_opts { unsigned int unmount_mode:2; + unsigned int bulk_read:2; + unsigned int chk_data_crc:2; }; /** @@ -905,13 +940,12 @@ struct ubifs_mount_opts { * @cmt_state: commit state * @cs_lock: commit state lock * @cmt_wq: wait queue to sleep on if the log is full and a commit is running + * * @fast_unmount: do not run journal commit before un-mounting * @big_lpt: flag that LPT is too big to write whole during commit - * @check_lpt_free: flag that indicates LPT GC may be needed - * @nospace: non-zero if the file-system does not have flash space (used as - * optimization) - * @nospace_rp: the same as @nospace, but additionally means that even reserved - * pool is full + * @no_chk_data_crc: do not check CRCs when reading data nodes (except during + * recovery) + * @bulk_read: enable bulk-reads * * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and * @calc_idx_sz @@ -935,6 +969,7 @@ struct ubifs_mount_opts { * @mst_node: master node * @mst_offs: offset of valid master node * @mst_mutex: protects the master node area, @mst_node, and @mst_offs + * @bulk_read_buf_size: buffer size for bulk-reads * * @log_lebs: number of logical eraseblocks in the log * @log_bytes: log size in bytes @@ -977,12 +1012,17 @@ struct ubifs_mount_opts { * but which still have to be taken into account because * the index has not been committed so far * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth, - * @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, and @lst; + * @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, @lst, + * @nospace, and @nospace_rp; * @min_idx_lebs: minimum number of 
LEBs required for the index * @old_idx_sz: size of index on flash * @calc_idx_sz: temporary variable which is used to calculate new index size * (contains accurate new index size at end of TNC commit start) * @lst: lprops statistics + * @nospace: non-zero if the file-system does not have flash space (used as + * optimization) + * @nospace_rp: the same as @nospace, but additionally means that even reserved + * pool is full * * @page_budget: budget for a page * @inode_budget: budget for an inode @@ -1061,6 +1101,7 @@ struct ubifs_mount_opts { * @lpt_drty_flgs: dirty flags for LPT special nodes e.g. ltab * @dirty_nn_cnt: number of dirty nnodes * @dirty_pn_cnt: number of dirty pnodes + * @check_lpt_free: flag that indicates LPT GC may be needed * @lpt_sz: LPT size * @lpt_nod_buf: buffer for an on-flash nnode or pnode * @lpt_buf: buffer of LEB size used by LPT @@ -1102,6 +1143,7 @@ struct ubifs_mount_opts { * @rcvrd_mst_node: recovered master node to write when mounting ro to rw * @size_tree: inode size information for recovery * @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY) + * @always_chk_crc: always check CRCs (while mounting and remounting rw) * @mount_opts: UBIFS-specific mount options * * @dbg_buf: a buffer of LEB size used for debugging purposes @@ -1146,11 +1188,11 @@ struct ubifs_info { int cmt_state; spinlock_t cs_lock; wait_queue_head_t cmt_wq; + unsigned int fast_unmount:1; unsigned int big_lpt:1; - unsigned int check_lpt_free:1; - unsigned int nospace:1; - unsigned int nospace_rp:1; + unsigned int no_chk_data_crc:1; + unsigned int bulk_read:1; struct mutex tnc_mutex; struct ubifs_zbranch zroot; @@ -1175,6 +1217,7 @@ struct ubifs_info { struct ubifs_mst_node *mst_node; int mst_offs; struct mutex mst_mutex; + int bulk_read_buf_size; int log_lebs; long long log_bytes; @@ -1218,6 +1261,8 @@ struct ubifs_info { unsigned long long old_idx_sz; unsigned long long calc_idx_sz; struct ubifs_lp_stats lst; + unsigned int nospace:1; + 
unsigned int nospace_rp:1; int page_budget; int inode_budget; @@ -1294,6 +1339,7 @@ struct ubifs_info { int lpt_drty_flgs; int dirty_nn_cnt; int dirty_pn_cnt; + int check_lpt_free; long long lpt_sz; void *lpt_nod_buf; void *lpt_buf; @@ -1335,6 +1381,7 @@ struct ubifs_info { struct ubifs_mst_node *rcvrd_mst_node; struct rb_root size_tree; int remounting_rw; + int always_chk_crc; struct ubifs_mount_opts mount_opts; #ifdef CONFIG_UBIFS_FS_DEBUG @@ -1347,6 +1394,12 @@ struct ubifs_info { unsigned long fail_timeout; unsigned int fail_cnt; unsigned int fail_cnt_max; + long long chk_lpt_sz; + long long chk_lpt_sz2; + long long chk_lpt_wastage; + int chk_lpt_lebs; + int new_nhead_lnum; + int new_nhead_offs; #endif }; @@ -1377,7 +1430,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, int offs, int dtype); int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, - int offs, int quiet); + int offs, int quiet, int chk_crc); void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last); int ubifs_io_init(struct ubifs_info *c); @@ -1490,6 +1543,8 @@ void destroy_old_idx(struct ubifs_info *c); int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level, int lnum, int offs); int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode); +int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu); +int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu); /* tnc_misc.c */ struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr, @@ -1586,12 +1641,10 @@ int ubifs_lpt_post_commit(struct ubifs_info *c); void ubifs_lpt_free(struct ubifs_info *c, int wr_only); /* lprops.c */ -void ubifs_get_lprops(struct ubifs_info *c); const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, const struct ubifs_lprops 
*lp, int free, int dirty, int flags, int idx_gc_cnt); -void ubifs_release_lprops(struct ubifs_info *c); void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats); void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, int cat); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 649bec78b645..cfd31e229c89 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -446,7 +446,7 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) int type; xent = ubifs_tnc_next_ent(c, &key, &nm); - if (unlikely(IS_ERR(xent))) { + if (IS_ERR(xent)) { err = PTR_ERR(xent); break; } |