From 04de0816173c86948b75da93a6344a0a02bbec4d Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Tue, 20 Apr 2010 14:49:01 +0200 Subject: pcmcia: pcmcia_dev_present bugfix pcmcia_dev_present is in and by itself buggy. Add a note specifying why it is broken, and replace the broken locking -- taking a mutex is a bad idea in IRQ context, from which this function is rarely called -- by an atomic_t. Signed-off-by: Dominik Brodowski --- include/pcmcia/ds.h | 7 +++---- include/pcmcia/ss.h | 8 +++----- 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/pcmcia/ds.h b/include/pcmcia/ds.h index d57847f2f6c1..aab3c13dc310 100644 --- a/include/pcmcia/ds.h +++ b/include/pcmcia/ds.h @@ -26,6 +26,7 @@ #ifdef __KERNEL__ #include #include +#include /* * PCMCIA device drivers (16-bit cards only; 32-bit cards require CardBus @@ -94,10 +95,8 @@ struct pcmcia_device { config_req_t conf; window_handle_t win; - /* Is the device suspended, or in the process of - * being removed? */ + /* Is the device suspended? */ u16 suspended:1; - u16 _removed:1; /* Flags whether io, irq, win configurations were * requested, and whether the configuration is "locked" */ @@ -115,7 +114,7 @@ struct pcmcia_device { u16 has_card_id:1; u16 has_func_id:1; - u16 reserved:3; + u16 reserved:4; u8 func_id; u16 manf_id; diff --git a/include/pcmcia/ss.h b/include/pcmcia/ss.h index 2e488b60bc76..344705cb42f4 100644 --- a/include/pcmcia/ss.h +++ b/include/pcmcia/ss.h @@ -224,18 +224,16 @@ struct pcmcia_socket { /* 16-bit state: */ struct { - /* PCMCIA card is present in socket */ - u8 present:1; /* "master" ioctl is used */ u8 busy:1; - /* pcmcia module is being unloaded */ - u8 dead:1; /* the PCMCIA card consists of two pseudo devices */ u8 has_pfc:1; - u8 reserved:4; + u8 reserved:6; } pcmcia_state; + /* non-zero if PCMCIA card is present */ + atomic_t present; #ifdef CONFIG_PCMCIA_IOCTL struct user_info_t *user; -- cgit v1.2.3 From c3c532061e46156e8aab1268f38d66cfb63aeb2d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Apr 2010 11:37:01 +0200 Subject: bdi: add helper function for doing init and register of a bdi for a file system Pretty trivial helper, just sets up the bdi and registers it. An atomic sequence count is used to ensure that the registered sysfs names are unique. 
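For orientation, the expected call pattern from a filesystem looks roughly like the sketch below; the coda, ncpfs and smbfs patches later in this series follow exactly this shape. The filesystem name and structure in the sketch are invented; only the bdi helpers, capability flag and superblock field are the real interfaces added or used by this series.

/*
 * Hedged usage sketch (not part of any patch in this series): how a
 * filesystem is expected to wire up the new helper.  "examplefs" and
 * struct examplefs_info are invented names; bdi_setup_and_register(),
 * bdi_destroy(), BDI_CAP_MAP_COPY and sb->s_bdi are the real interfaces.
 */
#include <linux/fs.h>
#include <linux/backing-dev.h>

struct examplefs_info {
	struct backing_dev_info bdi;
	/* ... other per-mount state ... */
};

static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	struct examplefs_info *fsi = sb->s_fs_info;
	int error;

	/* Inits and registers a bdi named "examplefs-<seq>", unique per mount. */
	error = bdi_setup_and_register(&fsi->bdi, "examplefs", BDI_CAP_MAP_COPY);
	if (error)
		return error;

	sb->s_bdi = &fsi->bdi;	/* dirty data now has a backing device to flush to */

	/* ... rest of superblock setup; call bdi_destroy() on any error path ... */
	return 0;
}

static void examplefs_put_super(struct super_block *sb)
{
	struct examplefs_info *fsi = sb->s_fs_info;

	bdi_destroy(&fsi->bdi);	/* pairs with bdi_setup_and_register() */
}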
Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 1 + mm/backing-dev.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index fcbc26af00e4..e19c677f219c 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -101,6 +101,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...); int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); void bdi_unregister(struct backing_dev_info *bdi); +int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, long nr_pages); int bdi_writeback_task(struct bdi_writeback *wb); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f13e067e1467..dbda4707f593 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -11,6 +11,8 @@ #include #include +static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); + void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { } @@ -715,6 +717,33 @@ void bdi_destroy(struct backing_dev_info *bdi) } EXPORT_SYMBOL(bdi_destroy); +/* + * For use from filesystems to quickly init and register a bdi associated + * with dirty writeback + */ +int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, + unsigned int cap) +{ + char tmp[32]; + int err; + + bdi->name = name; + bdi->capabilities = cap; + err = bdi_init(bdi); + if (err) + return err; + + sprintf(tmp, "%.28s%s", name, "-%d"); + err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq)); + if (err) { + bdi_destroy(bdi); + return err; + } + + return 0; +} +EXPORT_SYMBOL(bdi_setup_and_register); + static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) -- cgit v1.2.3 From 5163d90076729413cb882d3dd5c3d3cfb5b9f035 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Apr 2010 12:12:40 +0200 Subject: coda: add bdi backing to mount session This ensures that dirty data gets flushed properly. 
Signed-off-by: Jens Axboe --- fs/coda/inode.c | 8 ++++++++ include/linux/coda_psdev.h | 2 ++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/fs/coda/inode.c b/fs/coda/inode.c index a1695dcadd99..d97f9935a028 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -167,6 +167,10 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) return -EBUSY; } + error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY); + if (error) + goto bdi_err; + vc->vc_sb = sb; sb->s_fs_info = vc; @@ -175,6 +179,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) sb->s_blocksize_bits = 12; sb->s_magic = CODA_SUPER_MAGIC; sb->s_op = &coda_super_operations; + sb->s_bdi = &vc->bdi; /* get root fid from Venus: this needs the root inode */ error = venus_rootfid(sb, &fid); @@ -200,6 +205,8 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) return 0; error: + bdi_destroy(&vc->bdi); + bdi_err: if (root) iput(root); if (vc) @@ -210,6 +217,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) static void coda_put_super(struct super_block *sb) { + bdi_destroy(&coda_vcp(sb)->bdi); coda_vcp(sb)->vc_sb = NULL; sb->s_fs_info = NULL; diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h index 5b5d4731f956..644062e8d857 100644 --- a/include/linux/coda_psdev.h +++ b/include/linux/coda_psdev.h @@ -1,6 +1,7 @@ #ifndef __CODA_PSDEV_H #define __CODA_PSDEV_H +#include #include #define CODA_PSDEV_MAJOR 67 @@ -17,6 +18,7 @@ struct venus_comm { struct list_head vc_processing; int vc_inuse; struct super_block *vc_sb; + struct backing_dev_info bdi; }; -- cgit v1.2.3 From f1970c73cbb6b884152207e4dfe90639f5029905 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Apr 2010 12:31:11 +0200 Subject: ncpfs: add bdi backing to mount session This ensures that dirty data gets flushed properly. 
Signed-off-by: Jens Axboe --- fs/ncpfs/inode.c | 8 ++++++++ include/linux/ncp_fs_sb.h | 2 ++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index cf98da1be23e..fa3385154023 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -526,10 +526,15 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) sb->s_blocksize_bits = 10; sb->s_magic = NCP_SUPER_MAGIC; sb->s_op = &ncp_sops; + sb->s_bdi = &server->bdi; server = NCP_SBP(sb); memset(server, 0, sizeof(*server)); + error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY); + if (error) + goto out_bdi; + server->ncp_filp = ncp_filp; server->ncp_sock = sock; @@ -719,6 +724,8 @@ out_fput2: if (server->info_filp) fput(server->info_filp); out_fput: + bdi_destroy(&server->bdi); +out_bdi: /* 23/12/1998 Marcin Dalecki : * * The previously used put_filp(ncp_filp); was bogous, since @@ -756,6 +763,7 @@ static void ncp_put_super(struct super_block *sb) kill_pid(server->m.wdog_pid, SIGTERM, 1); put_pid(server->m.wdog_pid); + bdi_destroy(&server->bdi); kfree(server->priv.data); kfree(server->auth.object_name); vfree(server->rxbuf); diff --git a/include/linux/ncp_fs_sb.h b/include/linux/ncp_fs_sb.h index 6330fc76b00f..5ec9ca671687 100644 --- a/include/linux/ncp_fs_sb.h +++ b/include/linux/ncp_fs_sb.h @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef __KERNEL__ @@ -127,6 +128,7 @@ struct ncp_server { size_t len; __u8 data[128]; } unexpected_packet; + struct backing_dev_info bdi; }; extern void ncp_tcp_rcv_proc(struct work_struct *work); -- cgit v1.2.3 From 424264b7b220e8eee165dc3080ae48692af73dec Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Apr 2010 12:37:07 +0200 Subject: smbfs: add bdi backing to mount session This ensures that dirty data gets flushed properly. 
Signed-off-by: Jens Axboe --- fs/smbfs/inode.c | 8 ++++++++ include/linux/smb_fs_sb.h | 3 +++ 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 1c4c8f089970..dfa1d67f8fca 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -479,6 +479,7 @@ smb_put_super(struct super_block *sb) if (server->conn_pid) kill_pid(server->conn_pid, SIGTERM, 1); + bdi_destroy(&server->bdi); kfree(server->ops); smb_unload_nls(server); sb->s_fs_info = NULL; @@ -525,6 +526,11 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent) if (!server) goto out_no_server; sb->s_fs_info = server; + + if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY)) + goto out_bdi; + + sb->s_bdi = &server->bdi; server->super_block = sb; server->mnt = NULL; @@ -624,6 +630,8 @@ out_no_smbiod: out_bad_option: kfree(mem); out_no_mem: + bdi_destroy(&server->bdi); +out_bdi: if (!server->mnt) printk(KERN_ERR "smb_fill_super: allocation failure\n"); sb->s_fs_info = NULL; diff --git a/include/linux/smb_fs_sb.h b/include/linux/smb_fs_sb.h index 8a060a7040d8..bb947dd1fba9 100644 --- a/include/linux/smb_fs_sb.h +++ b/include/linux/smb_fs_sb.h @@ -10,6 +10,7 @@ #define _SMB_FS_SB #include +#include #include /* @@ -74,6 +75,8 @@ struct smb_sb_info { struct smb_ops *ops; struct super_block *super_block; + + struct backing_dev_info bdi; }; static inline int -- cgit v1.2.3 From 71d0a6112a363e703e383ae5b12c492485c39701 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 22 Apr 2010 15:35:57 -0400 Subject: NFS: Fix an unstable write data integrity race Commit 2c61be0a9478258f77b66208a0c4b1f5f8161c3c (NFS: Ensure that the WRITE and COMMIT RPC calls are always uninterruptible) exposed a race on file close. In order to ensure correct close-to-open behaviour, we want to wait for all outstanding background commit operations to complete. This patch adds an inode flag that indicates if a commit operation is under way, and provides a mechanism to allow ->write_inode() to wait for its completion if this is a data integrity flush. 
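Condensed, the waiting contract the changelog describes is: callers that need data integrity pass FLUSH_SYNC, which makes nfs_commit_inode() block on the new NFS_INO_COMMIT bit until the in-flight commit completes, while background callers pass 0 and return immediately. A hedged sketch of a data-integrity caller follows; the real ->write_inode() implementation is not part of this hunk and may differ in detail.

/*
 * Sketch only: how a data-integrity flush would use the new mechanism.
 * nfs_commit_inode() and FLUSH_SYNC are real (see the hunk below); the
 * surrounding function is illustrative, not the kernel's nfs_write_inode().
 */
static int example_nfs_sync_inode(struct inode *inode, int for_integrity)
{
	int flags = 0;

	if (for_integrity)
		flags = FLUSH_SYNC;	/* wait on NFS_INO_COMMIT until the commit RPC finishes */

	/* With FLUSH_SYNC this returns only after any in-flight commit completes. */
	return nfs_commit_inode(inode, flags);
}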
Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 36 ++++++++++++++++++++++++++++++++---- include/linux/nfs_fs.h | 1 + 2 files changed, 33 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index de38d63aa920..ccde2aeb3fec 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1201,6 +1201,25 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) +{ + if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags)) + return 1; + if (may_wait && !out_of_line_wait_on_bit_lock(&nfsi->flags, + NFS_INO_COMMIT, nfs_wait_bit_killable, + TASK_KILLABLE)) + return 1; + return 0; +} + +static void nfs_commit_clear_lock(struct nfs_inode *nfsi) +{ + clear_bit(NFS_INO_COMMIT, &nfsi->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); +} + + static void nfs_commitdata_release(void *data) { struct nfs_write_data *wdata = data; @@ -1262,8 +1281,6 @@ static int nfs_commit_rpcsetup(struct list_head *head, task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); - if (how & FLUSH_SYNC) - rpc_wait_for_completion_task(task); rpc_put_task(task); return 0; } @@ -1294,6 +1311,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) BDI_RECLAIMABLE); nfs_clear_page_tag_locked(req); } + nfs_commit_clear_lock(NFS_I(inode)); return -ENOMEM; } @@ -1349,6 +1367,7 @@ static void nfs_commit_release(void *calldata) next: nfs_clear_page_tag_locked(req); } + nfs_commit_clear_lock(NFS_I(data->inode)); nfs_commitdata_release(calldata); } @@ -1363,8 +1382,11 @@ static const struct rpc_call_ops nfs_commit_ops = { static int nfs_commit_inode(struct inode *inode, int how) { LIST_HEAD(head); - int res; + int may_wait = how & FLUSH_SYNC; + int res = 0; + if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) + goto out; spin_lock(&inode->i_lock); res = nfs_scan_commit(inode, &head, 0, 0); spin_unlock(&inode->i_lock); @@ -1372,7 +1394,13 @@ static int nfs_commit_inode(struct inode *inode, int how) int error = nfs_commit_list(inode, &head, how); if (error < 0) return error; - } + if (may_wait) + wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); + } else + nfs_commit_clear_lock(NFS_I(inode)); +out: return res; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 1a0b85aa151e..07ce4609fe50 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -209,6 +209,7 @@ struct nfs_inode { #define NFS_INO_FLUSHING (4) /* inode is flushing out data */ #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ #define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ +#define NFS_INO_COMMIT (7) /* inode is committing unstable writes */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { -- cgit v1.2.3 From 3a3076f4d6e2fa31338a0b007df42a3b32f079e0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 23 Apr 2010 12:17:17 -0400 Subject: Cleanup generic block based fiemap This cleans up a few of the complaints of __generic_block_fiemap. I've fixed all the typing stuff, used inline functions instead of macros, gotten rid of a couple of variables, and made sure the size and block requests are all block aligned. It also fixes a problem where sometimes FIEMAP_EXTENT_LAST wasn't being set properly. 
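Part of the "typing stuff" is visible in the removed macros below: logical_to_blk() carried a stray trailing semicolon, so it only worked as a standalone statement, and neither macro gave the compiler a sector_t/loff_t type to check. A small illustration of why the inline-function form is safer; the caller shown in the comment is hypothetical.

/*
 * Illustration only.  The old macro was
 *
 *	#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits);
 *
 * and the trailing ';' makes any use inside a larger expression a syntax
 * error, e.g.
 *
 *	last_blk = logical_to_blk(inode, start + len - 1) + 1;
 * expands to
 *	last_blk = (start + len - 1 >> (inode)->i_blkbits); + 1;
 *
 * The typed inline replacement composes freely and checks its argument types:
 */
static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
{
	return (offset >> inode->i_blkbits);
}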
Signed-off-by: Josef Bacik Signed-off-by: Linus Torvalds --- fs/ioctl.c | 92 +++++++++++++++++++++++++++++++----------------------- include/linux/fs.h | 5 +-- 2 files changed, 56 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/fs/ioctl.c b/fs/ioctl.c index 6c751106c2e5..7faefb4da939 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -228,14 +228,23 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) #ifdef CONFIG_BLOCK -#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits) -#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits); +static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) +{ + return (offset >> inode->i_blkbits); +} + +static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) +{ + return (blk << inode->i_blkbits); +} /** * __generic_block_fiemap - FIEMAP for block based inodes (no locking) - * @inode - the inode to map - * @arg - the pointer to userspace where we copy everything to - * @get_block - the fs's get_block function + * @inode: the inode to map + * @fieinfo: the fiemap info struct that will be passed back to userspace + * @start: where to start mapping in the inode + * @len: how much space to map + * @get_block: the fs's get_block function * * This does FIEMAP for block based inodes. Basically it will just loop * through get_block until we hit the number of extents we want to map, or we @@ -250,58 +259,63 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) */ int __generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, u64 start, - u64 len, get_block_t *get_block) + struct fiemap_extent_info *fieinfo, loff_t start, + loff_t len, get_block_t *get_block) { - struct buffer_head tmp; - unsigned long long start_blk; - long long length = 0, map_len = 0; + struct buffer_head map_bh; + sector_t start_blk, last_blk; + loff_t isize = i_size_read(inode); u64 logical = 0, phys = 0, size = 0; u32 flags = FIEMAP_EXTENT_MERGED; - int ret = 0, past_eof = 0, whole_file = 0; + bool past_eof = false, whole_file = false; + int ret = 0; - if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC))) + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + if (ret) return ret; - start_blk = logical_to_blk(inode, start); - - length = (long long)min_t(u64, len, i_size_read(inode)); - if (length < len) - whole_file = 1; + /* + * Either the i_mutex or other appropriate locking needs to be held + * since we expect isize to not change at all through the duration of + * this call. 
+ */ + if (len >= isize) { + whole_file = true; + len = isize; + } - map_len = length; + start_blk = logical_to_blk(inode, start); + last_blk = logical_to_blk(inode, start + len - 1); do { /* * we set b_size to the total size we want so it will map as * many contiguous blocks as possible at once */ - memset(&tmp, 0, sizeof(struct buffer_head)); - tmp.b_size = map_len; + memset(&map_bh, 0, sizeof(struct buffer_head)); + map_bh.b_size = len; - ret = get_block(inode, start_blk, &tmp, 0); + ret = get_block(inode, start_blk, &map_bh, 0); if (ret) break; /* HOLE */ - if (!buffer_mapped(&tmp)) { - length -= blk_to_logical(inode, 1); + if (!buffer_mapped(&map_bh)) { start_blk++; /* - * we want to handle the case where there is an + * We want to handle the case where there is an * allocated block at the front of the file, and then * nothing but holes up to the end of the file properly, * to make sure that extent at the front gets properly * marked with FIEMAP_EXTENT_LAST */ if (!past_eof && - blk_to_logical(inode, start_blk) >= - blk_to_logical(inode, 0)+i_size_read(inode)) + blk_to_logical(inode, start_blk) >= isize) past_eof = 1; /* - * first hole after going past the EOF, this is our + * First hole after going past the EOF, this is our * last extent */ if (past_eof && size) { @@ -309,15 +323,18 @@ int __generic_block_fiemap(struct inode *inode, ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); - break; + } else if (size) { + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + size = 0; } /* if we have holes up to/past EOF then we're done */ - if (length <= 0 || past_eof) + if (start_blk > last_blk || past_eof || ret) break; } else { /* - * we have gone over the length of what we wanted to + * We have gone over the length of what we wanted to * map, and it wasn't the entire file, so add the extent * we got last time and exit. 
* @@ -331,7 +348,7 @@ int __generic_block_fiemap(struct inode *inode, * are good to go, just add the extent to the fieinfo * and break */ - if (length <= 0 && !whole_file) { + if (start_blk > last_blk && !whole_file) { ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); @@ -351,11 +368,10 @@ int __generic_block_fiemap(struct inode *inode, } logical = blk_to_logical(inode, start_blk); - phys = blk_to_logical(inode, tmp.b_blocknr); - size = tmp.b_size; + phys = blk_to_logical(inode, map_bh.b_blocknr); + size = map_bh.b_size; flags = FIEMAP_EXTENT_MERGED; - length -= tmp.b_size; start_blk += logical_to_blk(inode, size); /* @@ -363,15 +379,13 @@ int __generic_block_fiemap(struct inode *inode, * soon as we find a hole that the last extent we found * is marked with FIEMAP_EXTENT_LAST */ - if (!past_eof && - logical+size >= - blk_to_logical(inode, 0)+i_size_read(inode)) - past_eof = 1; + if (!past_eof && logical + size >= isize) + past_eof = true; } cond_resched(); } while (1); - /* if ret is 1 then we just hit the end of the extent array */ + /* If ret is 1 then we just hit the end of the extent array */ if (ret == 1) ret = 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index 39d57bc6cc71..44f35aea2f1f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2315,8 +2315,9 @@ extern int vfs_fstatat(int , char __user *, struct kstat *, int); extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, unsigned long arg); extern int __generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, u64 start, - u64 len, get_block_t *get_block); + struct fiemap_extent_info *fieinfo, + loff_t start, loff_t len, + get_block_t *get_block); extern int generic_block_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len, get_block_t *get_block); -- cgit v1.2.3 From 23be7468e8802a2ac1de6ee3eecb3ec7f14dc703 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 23 Apr 2010 13:17:56 -0400 Subject: hugetlb: fix infinite loop in get_futex_key() when backed by huge pages If a futex key happens to be located within a huge page mapped MAP_PRIVATE, get_futex_key() can go into an infinite loop waiting for a page->mapping that will never exist. See https://bugzilla.redhat.com/show_bug.cgi?id=552257 for more details about the problem. This patch makes page->mapping a poisoned value that includes PAGE_MAPPING_ANON mapped MAP_PRIVATE. This is enough for futex to continue but because of PAGE_MAPPING_ANON, the poisoned value is not dereferenced or used by futex. No other part of the VM should be dereferencing the page->mapping of a hugetlbfs page as its page cache is not on the LRU. This patch fixes the problem with the test case described in the bugzilla. [akpm@linux-foundation.org: mel cant spel] Signed-off-by: Mel Gorman Acked-by: Peter Zijlstra Acked-by: Darren Hart Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/poison.h | 9 +++++++++ mm/hugetlb.c | 5 ++++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/poison.h b/include/linux/poison.h index 2110a81c5e2a..34066ffd893d 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -48,6 +48,15 @@ #define POISON_FREE 0x6b /* for use-after-free poisoning */ #define POISON_END 0xa5 /* end-byte of poisoning */ +/********** mm/hugetlb.c **********/ +/* + * Private mappings of hugetlb pages use this poisoned value for + * page->mapping. 
The core VM should not be doing anything with this mapping + * but futex requires the existence of some page->mapping value even though it + * is unused if PAGE_MAPPING_ANON is set. + */ +#define HUGETLB_POISON ((void *)(0x00300300 + POISON_POINTER_DELTA + PAGE_MAPPING_ANON)) + /********** arch/$ARCH/mm/init.c **********/ #define POISON_FREE_INITMEM 0xcc diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6034dc9e9796..ffbdfc86aedf 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -546,6 +546,7 @@ static void free_huge_page(struct page *page) mapping = (struct address_space *) page_private(page); set_page_private(page, 0); + page->mapping = NULL; BUG_ON(page_count(page)); INIT_LIST_HEAD(&page->lru); @@ -2447,8 +2448,10 @@ retry: spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); - } else + } else { lock_page(page); + page->mapping = HUGETLB_POISON; + } } /* -- cgit v1.2.3 From 5129a469a91a91427334c40e29e64c6d0ab68caf Mon Sep 17 00:00:00 2001 From: Jörn Engel Date: Sun, 25 Apr 2010 08:54:42 +0200 Subject: Catch filesystems lacking s_bdi noop_backing_dev_info is used only as a flag to mark filesystems that don't have any backing store, like tmpfs, procfs, spufs, etc. Signed-off-by: Joern Engel Changed the BUG_ON() to a WARN_ON(). Note that adding dirty inodes to the noop_backing_dev_info is not legal and will not result in them being flushed, but we already catch this condition in __mark_inode_dirty() when checking for a registered bdi. Signed-off-by: Jens Axboe --- fs/super.c | 8 +++++--- fs/sync.c | 3 ++- include/linux/backing-dev.h | 1 + mm/backing-dev.c | 5 +++++ 4 files changed, 13 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/fs/super.c b/fs/super.c index f35ac6022109..dc72491a19f9 100644 --- a/fs/super.c +++ b/fs/super.c @@ -693,6 +693,7 @@ int set_anon_super(struct super_block *s, void *data) return -EMFILE; } s->s_dev = MKDEV(0, dev & MINORMASK); + s->s_bdi = &noop_backing_dev_info; return 0; } @@ -954,10 +955,11 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void if (error < 0) goto out_free_secdata; BUG_ON(!mnt->mnt_sb); + WARN_ON(!mnt->mnt_sb->s_bdi); - error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata); - if (error) - goto out_sb; + error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata); + if (error) + goto out_sb; /* * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE diff --git a/fs/sync.c b/fs/sync.c index fc5c3d75cf3c..92b228176f7c 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internal.h" #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ @@ -32,7 +33,7 @@ static int __sync_filesystem(struct super_block *sb, int wait) * This should be safe, as we require bdi backing to actually * write out data in the first place */ - if (!sb->s_bdi) + if (!sb->s_bdi || sb->s_bdi == &noop_backing_dev_info) return 0; if (sb->s_qcop && sb->s_qcop->quota_sync) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e19c677f219c..bd0e3c6f323f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -247,6 +247,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #endif extern struct backing_dev_info default_backing_dev_info; +extern struct backing_dev_info noop_backing_dev_info; void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page); int writeback_in_progress(struct backing_dev_info *bdi); diff --git 
a/mm/backing-dev.c b/mm/backing-dev.c index dbda4707f593..707d0dc6da0f 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -27,6 +27,11 @@ struct backing_dev_info default_backing_dev_info = { }; EXPORT_SYMBOL_GPL(default_backing_dev_info); +struct backing_dev_info noop_backing_dev_info = { + .name = "noop", +}; +EXPORT_SYMBOL_GPL(noop_backing_dev_info); + static struct class *bdi_class; /* -- cgit v1.2.3 From 33f60e9640b2f60dde6735293d4aa5ecc5b1d5d5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 Apr 2010 09:20:33 +0200 Subject: coda: move backing-dev.h kernel include inside __KERNEL__ Otherwise we must export backing-dev.h as well, which doesn't make any sense. Signed-off-by: Jens Axboe --- include/linux/coda_psdev.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h index 644062e8d857..8859e2ede9fe 100644 --- a/include/linux/coda_psdev.h +++ b/include/linux/coda_psdev.h @@ -1,13 +1,14 @@ #ifndef __CODA_PSDEV_H #define __CODA_PSDEV_H -#include #include #define CODA_PSDEV_MAJOR 67 #define MAX_CODADEVS 5 /* how many do we allow */ #ifdef __KERNEL__ +#include + struct kstatfs; /* communication pending/processing queues */ -- cgit v1.2.3 From 561b1733a465cf9677356b40c27653dd45f1ac56 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 28 Apr 2010 08:47:18 +0000 Subject: sctp: avoid irq lock inversion while call sk->sk_data_ready() sk->sk_data_ready() of sctp socket can be called from both BH and non-BH contexts, but the default sk->sk_data_ready(), sock_def_readable(), can not be used in this case. Therefore, we have to make a new function sctp_data_ready() to grab sk->sk_data_ready() with BH disabling. ========================================================= [ INFO: possible irq lock inversion dependency detected ] 2.6.33-rc6 #129 --------------------------------------------------------- sctp_darn/1517 just changed the state of lock: (clock-AF_INET){++.?..}, at: [] sock_def_readable+0x20/0x80 but this lock took another, SOFTIRQ-unsafe lock in the past: (slock-AF_INET){+.-...} and interrupts could create inverse lock ordering between them. other info that might help us debug this: 1 lock held by sctp_darn/1517: #0: (sk_lock-AF_INET){+.+.+.}, at: [] sctp_sendmsg+0x23d/0xc00 [sctp] Signed-off-by: Wei Yongjun Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 1 + net/sctp/endpointola.c | 1 + net/sctp/socket.c | 10 ++++++++++ 3 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 78740ec57d5d..fa6cde578a1d 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -128,6 +128,7 @@ extern int sctp_register_pf(struct sctp_pf *, sa_family_t); int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb); int sctp_inet_listen(struct socket *sock, int backlog); void sctp_write_space(struct sock *sk); +void sctp_data_ready(struct sock *sk, int len); unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait); void sctp_sock_rfree(struct sk_buff *skb); diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 905fda582b92..7ec09ba03a1c 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -144,6 +144,7 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, /* Use SCTP specific send buffer space queues. 
*/ ep->sndbuf_policy = sctp_sndbuf_policy; + sk->sk_data_ready = sctp_data_ready; sk->sk_write_space = sctp_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 007e8baba089..efa2bc3f0028 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -6189,6 +6189,16 @@ do_nonblock: goto out; } +void sctp_data_ready(struct sock *sk, int len) +{ + read_lock_bh(&sk->sk_callback_lock); + if (sk_has_sleeper(sk)) + wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN | + POLLRDNORM | POLLRDBAND); + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + read_unlock_bh(&sk->sk_callback_lock); +} + /* If socket sndbuf has changed, wake up all per association waiters. */ void sctp_write_space(struct sock *sk) { -- cgit v1.2.3 From c0786693404cffd80ca3cb6e75ee7b35186b2825 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Wed, 28 Apr 2010 08:47:22 +0000 Subject: sctp: Fix oops when sending queued ASCONF chunks When we finish processing ASCONF_ACK chunk, we try to send the next queued ASCONF. This action runs the sctp state machine recursively and it's not prepared to do so. kernel BUG at kernel/timer.c:790! invalid opcode: 0000 [#1] SMP last sysfs file: /sys/module/ipv6/initstate Modules linked in: sha256_generic sctp libcrc32c ipv6 dm_multipath uinput 8139too i2c_piix4 8139cp mii i2c_core pcspkr virtio_net joydev floppy virtio_blk virtio_pci [last unloaded: scsi_wait_scan] Pid: 0, comm: swapper Not tainted 2.6.34-rc4 #15 /Bochs EIP: 0060:[] EFLAGS: 00010286 CPU: 0 EIP is at add_timer+0xd/0x1b EAX: cecbab14 EBX: 000000f0 ECX: c0957b1c EDX: 03595cf4 ESI: cecba800 EDI: cf276f00 EBP: c0957aa0 ESP: c0957aa0 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 Process swapper (pid: 0, ti=c0956000 task=c0988ba0 task.ti=c0956000) Stack: c0957ae0 d1851214 c0ab62e4 c0ab5f26 0500ffff 00000004 00000005 00000004 <0> 00000000 d18694fd 00000004 1666b892 cecba800 cecba800 c0957b14 00000004 <0> c0957b94 d1851b11 ceda8b00 cecba800 cf276f00 00000001 c0957b14 000000d0 Call Trace: [] ? sctp_side_effects+0x607/0xdfc [sctp] [] ? sctp_do_sm+0x108/0x159 [sctp] [] ? sctp_pname+0x0/0x1d [sctp] [] ? sctp_primitive_ASCONF+0x36/0x3b [sctp] [] ? sctp_process_asconf_ack+0x2a4/0x2d3 [sctp] [] ? sctp_sf_do_asconf_ack+0x1dd/0x2b4 [sctp] [] ? sctp_do_sm+0xb8/0x159 [sctp] [] ? sctp_cname+0x0/0x52 [sctp] [] ? sctp_assoc_bh_rcv+0xac/0xe1 [sctp] [] ? sctp_inq_push+0x2d/0x30 [sctp] [] ? sctp_rcv+0x797/0x82e [sctp] Tested-by: Wei Yongjun Signed-off-by: Yuansong Qiao Signed-off-by: Shuaijun Zhang Signed-off-by: Vlad Yasevich Signed-off-by: David S. 
Miller --- include/net/sctp/command.h | 1 + net/sctp/sm_make_chunk.c | 15 --------------- net/sctp/sm_sideeffect.c | 26 ++++++++++++++++++++++++++ net/sctp/sm_statefuns.c | 8 +++++++- 4 files changed, 34 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/net/sctp/command.h b/include/net/sctp/command.h index 8be5135ff7aa..2c55a7ea20af 100644 --- a/include/net/sctp/command.h +++ b/include/net/sctp/command.h @@ -107,6 +107,7 @@ typedef enum { SCTP_CMD_T1_RETRAN, /* Mark for retransmission after T1 timeout */ SCTP_CMD_UPDATE_INITTAG, /* Update peer inittag */ SCTP_CMD_SEND_MSG, /* Send the whole use message */ + SCTP_CMD_SEND_NEXT_ASCONF, /* Send the next ASCONF after ACK */ SCTP_CMD_LAST } sctp_verb_t; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index f6fc5c1a4078..0fd5b4c88358 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -3318,21 +3318,6 @@ int sctp_process_asconf_ack(struct sctp_association *asoc, sctp_chunk_free(asconf); asoc->addip_last_asconf = NULL; - /* Send the next asconf chunk from the addip chunk queue. */ - if (!list_empty(&asoc->addip_chunk_list)) { - struct list_head *entry = asoc->addip_chunk_list.next; - asconf = list_entry(entry, struct sctp_chunk, list); - - list_del_init(entry); - - /* Hold the chunk until an ASCONF_ACK is received. */ - sctp_chunk_hold(asconf); - if (sctp_primitive_ASCONF(asoc, asconf)) - sctp_chunk_free(asconf); - else - asoc->addip_last_asconf = asconf; - } - return retval; } diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 4c5bed9af4e3..d5ae450b6f02 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -962,6 +962,29 @@ static int sctp_cmd_send_msg(struct sctp_association *asoc, } +/* Sent the next ASCONF packet currently stored in the association. + * This happens after the ASCONF_ACK was succeffully processed. + */ +static void sctp_cmd_send_asconf(struct sctp_association *asoc) +{ + /* Send the next asconf chunk from the addip chunk + * queue. + */ + if (!list_empty(&asoc->addip_chunk_list)) { + struct list_head *entry = asoc->addip_chunk_list.next; + struct sctp_chunk *asconf = list_entry(entry, + struct sctp_chunk, list); + list_del_init(entry); + + /* Hold the chunk until an ASCONF_ACK is received. */ + sctp_chunk_hold(asconf); + if (sctp_primitive_ASCONF(asoc, asconf)) + sctp_chunk_free(asconf); + else + asoc->addip_last_asconf = asconf; + } +} + /* These three macros allow us to pull the debugging code out of the * main flow of sctp_do_sm() to keep attention focused on the real @@ -1617,6 +1640,9 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, } error = sctp_cmd_send_msg(asoc, cmd->obj.msg); break; + case SCTP_CMD_SEND_NEXT_ASCONF: + sctp_cmd_send_asconf(asoc); + break; default: printk(KERN_WARNING "Impossible command: %u, %p\n", cmd->verb, cmd->obj.ptr); diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index abf601a1b847..24b2cd555637 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3676,8 +3676,14 @@ sctp_disposition_t sctp_sf_do_asconf_ack(const struct sctp_endpoint *ep, SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); if (!sctp_process_asconf_ack((struct sctp_association *)asoc, - asconf_ack)) + asconf_ack)) { + /* Successfully processed ASCONF_ACK. We can + * release the next asconf if we have one. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_NEXT_ASCONF, + SCTP_NULL()); return SCTP_DISPOSITION_CONSUME; + } abort = sctp_make_abort(asoc, asconf_ack, sizeof(sctp_errhdr_t)); -- cgit v1.2.3
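The structure of this last fix is worth spelling out: instead of calling sctp_primitive_ASCONF() (and thereby re-entering sctp_do_sm()) from inside chunk processing, the state function only queues SCTP_CMD_SEND_NEXT_ASCONF, and the command interpreter performs the send after the current state-machine run has finished. Below is a schematic of that deferral pattern with invented names; it is not the kernel's SCTP code, only the shape of it.

/*
 * Schematic only -- invented names, not kernel SCTP types.  The point is the
 * shape of the fix: record follow-up work as a command, run it after the
 * state function returns, never recurse into the state machine.
 */
#include <stddef.h>

enum side_effect { CMD_NONE = 0, CMD_SEND_NEXT_ASCONF };

struct cmd_seq {
	enum side_effect cmds[8];
	size_t n;
};

static void add_cmd(struct cmd_seq *seq, enum side_effect cmd)
{
	if (seq->n < 8)
		seq->cmds[seq->n++] = cmd;	/* remember the work, do not do it yet */
}

static void send_next_queued_asconf(void)
{
	/* would pull the next chunk off the addip queue and transmit it */
}

/* The state function (analogue of sctp_sf_do_asconf_ack) only queues work... */
static void handle_asconf_ack(struct cmd_seq *seq)
{
	add_cmd(seq, CMD_SEND_NEXT_ASCONF);
}

/* ...and the interpreter (analogue of sctp_cmd_interpreter) executes it once
 * the state function has unwound, so the machine is never entered recursively. */
static void run_side_effects(struct cmd_seq *seq)
{
	size_t i;

	for (i = 0; i < seq->n; i++)
		if (seq->cmds[i] == CMD_SEND_NEXT_ASCONF)
			send_next_queued_asconf();
}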