From 4cd9069a0a0e5fb8b007425c937642682ac96c76 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Wed, 25 Apr 2012 14:53:05 +0100 Subject: fs: remove 8 bytes of padding from struct writeback_control on 64 bit builds Reorder structure writeback_control to remove 8 bytes of padding on 64 bit builds, this shrinks its size from 48 to 40 bytes. This structure is always on the stack and uses C99 named initialisation, so should be safe and have a small impact on stack usage. Signed-off-by: Richard Kennedy Signed-off-by: Fengguang Wu --- include/linux/writeback.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/writeback.h') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a2b84f598e2b..3309736ff059 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -58,7 +58,6 @@ extern const char *wb_reason_name[]; * in a manner such that unspecified fields are set to zero. */ struct writeback_control { - enum writeback_sync_modes sync_mode; long nr_to_write; /* Write this many pages, and decrement this for each page written */ long pages_skipped; /* Pages which were not written */ @@ -71,6 +70,8 @@ struct writeback_control { loff_t range_start; loff_t range_end; + enum writeback_sync_modes sync_mode; + unsigned for_kupdate:1; /* A kupdate writeback */ unsigned for_background:1; /* A background writeback */ unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */ -- cgit v1.2.3 From 169ebd90131b2ffca74bb2dbe7eeacd39fb83714 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 May 2012 14:48:03 +0200 Subject: writeback: Avoid iput() from flusher thread Doing iput() from flusher thread (writeback_sb_inodes()) can create problems because iput() can do a lot of work - for example truncate the inode if it's the last iput on unlinked file. Some filesystems depend on flusher thread progressing (e.g. because they need to flush delay allocated blocks to reduce allocation uncertainty) and so flusher thread doing truncate creates interesting dependencies and possibilities for deadlocks. We get rid of iput() in flusher thread by using the fact that I_SYNC inode flag effectively pins the inode in memory. So if we take care to either hold i_lock or have I_SYNC set, we can get away without taking inode reference in writeback_sb_inodes(). As a side effect of these changes, we also fix possible use-after-free in wb_writeback() because inode_wait_for_writeback() call could try to reacquire i_lock on the inode that was already free. Signed-off-by: Jan Kara Signed-off-by: Fengguang Wu --- fs/fs-writeback.c | 66 +++++++++++++++++++++++++++++++++++++---------- fs/inode.c | 8 +++++- include/linux/fs.h | 7 ++--- include/linux/writeback.h | 7 +---- 4 files changed, 65 insertions(+), 23 deletions(-) (limited to 'include/linux/writeback.h') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5f2c68289610..8d2fb8c88cf3 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -326,9 +326,12 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc) } /* - * Wait for writeback on an inode to complete. + * Wait for writeback on an inode to complete. Called with i_lock held. + * Caller must make sure inode cannot go away when we drop i_lock. */ -static void inode_wait_for_writeback(struct inode *inode) +static void __inode_wait_for_writeback(struct inode *inode) + __releases(inode->i_lock) + __acquires(inode->i_lock) { DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); wait_queue_head_t *wqh; @@ -341,6 +344,36 @@ static void inode_wait_for_writeback(struct inode *inode) } } +/* + * Wait for writeback on an inode to complete. Caller must have inode pinned. + */ +void inode_wait_for_writeback(struct inode *inode) +{ + spin_lock(&inode->i_lock); + __inode_wait_for_writeback(inode); + spin_unlock(&inode->i_lock); +} + +/* + * Sleep until I_SYNC is cleared. This function must be called with i_lock + * held and drops it. It is aimed for callers not holding any inode reference + * so once i_lock is dropped, inode can go away. + */ +static void inode_sleep_on_writeback(struct inode *inode) + __releases(inode->i_lock) +{ + DEFINE_WAIT(wait); + wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC); + int sleep; + + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + sleep = inode->i_state & I_SYNC; + spin_unlock(&inode->i_lock); + if (sleep) + schedule(); + finish_wait(wqh, &wait); +} + /* * Find proper writeback list for the inode depending on its current state and * possibly also change of its state while we were doing writeback. Here we @@ -479,9 +512,11 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, if (wbc->sync_mode != WB_SYNC_ALL) goto out; /* - * It's a data-integrity sync. We must wait. + * It's a data-integrity sync. We must wait. Since callers hold + * inode reference or inode has I_WILL_FREE set, it cannot go + * away under us. */ - inode_wait_for_writeback(inode); + __inode_wait_for_writeback(inode); } WARN_ON(inode->i_state & I_SYNC); /* @@ -620,20 +655,28 @@ static long writeback_sb_inodes(struct super_block *sb, } spin_unlock(&wb->list_lock); - __iget(inode); /* * We already requeued the inode if it had I_SYNC set and we * are doing WB_SYNC_NONE writeback. So this catches only the * WB_SYNC_ALL case. */ - if (inode->i_state & I_SYNC) - inode_wait_for_writeback(inode); + if (inode->i_state & I_SYNC) { + /* Wait for I_SYNC. This function drops i_lock... */ + inode_sleep_on_writeback(inode); + /* Inode may be gone, start again */ + continue; + } inode->i_state |= I_SYNC; spin_unlock(&inode->i_lock); + write_chunk = writeback_chunk_size(wb->bdi, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; + /* + * We use I_SYNC to pin the inode in memory. While it is set + * evict_inode() will wait so the inode cannot be freed. + */ __writeback_single_inode(inode, wb, &wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; @@ -645,10 +688,7 @@ static long writeback_sb_inodes(struct super_block *sb, requeue_inode(inode, wb, &wbc); inode_sync_complete(inode); spin_unlock(&inode->i_lock); - spin_unlock(&wb->list_lock); - iput(inode); - cond_resched(); - spin_lock(&wb->list_lock); + cond_resched_lock(&wb->list_lock); /* * bail out to wb_writeback() often enough to check * background threshold and other termination conditions. @@ -843,8 +883,8 @@ static long wb_writeback(struct bdi_writeback *wb, inode = wb_inode(wb->b_more_io.prev); spin_lock(&inode->i_lock); spin_unlock(&wb->list_lock); - inode_wait_for_writeback(inode); - spin_unlock(&inode->i_lock); + /* This function drops i_lock... */ + inode_sleep_on_writeback(inode); spin_lock(&wb->list_lock); } } diff --git a/fs/inode.c b/fs/inode.c index 02c0fa5e16a4..f4e145016611 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -530,7 +530,13 @@ static void evict(struct inode *inode) inode_sb_list_del(inode); - inode_sync_wait(inode); + /* + * Wait for flusher thread to be done with the inode so that filesystem + * does not start destroying it while writeback is still running. Since + * the inode has I_FREEING set, flusher thread won't start new work on + * the inode. We just have to wait for running writeback to finish. + */ + inode_wait_for_writeback(inode); if (op->evict_inode) { op->evict_inode(inode); diff --git a/include/linux/fs.h b/include/linux/fs.h index c79316c79ee3..1c71e7f4d234 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1753,9 +1753,10 @@ struct super_operations { * anew. Other functions will just ignore such inodes, * if appropriate. I_NEW is used for waiting. * - * I_SYNC Synchonized write of dirty inode data. The bits is - * set during data writeback, and cleared with a wakeup - * on the bit address once it is done. + * I_SYNC Writeback of inode is running. The bit is set during + * data writeback, and cleared with a wakeup on the bit + * address once it is done. The bit is also used to pin + * the inode in memory for flusher thread. * * I_REFERENCED Marks the inode as recently references on the LRU list. * diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 3309736ff059..6d0a0fcd80e7 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -95,6 +95,7 @@ long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, enum wb_reason reason); long wb_do_writeback(struct bdi_writeback *wb, int force_wait); void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); +void inode_wait_for_writeback(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ static inline void wait_on_inode(struct inode *inode) @@ -102,12 +103,6 @@ static inline void wait_on_inode(struct inode *inode) might_sleep(); wait_on_bit(&inode->i_state, __I_NEW, inode_wait, TASK_UNINTERRUPTIBLE); } -static inline void inode_sync_wait(struct inode *inode) -{ - might_sleep(); - wait_on_bit(&inode->i_state, __I_SYNC, inode_wait, - TASK_UNINTERRUPTIBLE); -} /* -- cgit v1.2.3 From 3965c9ae47d64aadf6f13b6fcd37767b83c0689a Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 31 Jul 2012 16:41:52 -0700 Subject: mm: prepare for removal of obsolete /proc/sys/vm/nr_pdflush_threads Since per-BDI flusher threads were introduced in 2.6, the pdflush mechanism is not used any more. But the old interface exported through /proc/sys/vm/nr_pdflush_threads still exists and is obviously useless. For back-compatibility, printk warning information and return 2 to notify the users that the interface is removed. Signed-off-by: Wanpeng Li Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../ABI/obsolete/proc-sys-vm-nr_pdflush_threads | 5 +++++ Documentation/feature-removal-schedule.txt | 8 ++++++++ Documentation/sysctl/vm.txt | 11 ----------- fs/fs-writeback.c | 5 ----- include/linux/backing-dev.h | 3 +++ include/linux/writeback.h | 5 ----- kernel/sysctl.c | 8 +++----- kernel/sysctl_binary.c | 2 +- mm/backing-dev.c | 20 ++++++++++++++++++++ 9 files changed, 40 insertions(+), 27 deletions(-) create mode 100644 Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads (limited to 'include/linux/writeback.h') diff --git a/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads b/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads new file mode 100644 index 000000000000..b0b0eeb20fe3 --- /dev/null +++ b/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads @@ -0,0 +1,5 @@ +What: /proc/sys/vm/nr_pdflush_threads +Date: June 2012 +Contact: Wanpeng Li +Description: Since pdflush is replaced by per-BDI flusher, the interface of old pdflush + exported in /proc/sys/vm/ should be removed. diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index e9237fb71950..88f2fa48bb63 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -13,6 +13,14 @@ Who: Jim Cromie , Jason Baron --------------------------- +What: /proc/sys/vm/nr_pdflush_threads +When: 2012 +Why: Since pdflush is deprecated, the interface exported in /proc/sys/vm/ + should be removed. +Who: Wanpeng Li + +--------------------------- + What: CONFIG_APM_CPU_IDLE, and its ability to call APM BIOS in idle When: 2012 Why: This optional sub-feature of APM is of dubious reliability, diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 84eb25cd69aa..06d662b1c5d5 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -42,7 +42,6 @@ Currently, these files are in /proc/sys/vm: - mmap_min_addr - nr_hugepages - nr_overcommit_hugepages -- nr_pdflush_threads - nr_trim_pages (only if CONFIG_MMU=n) - numa_zonelist_order - oom_dump_tasks @@ -426,16 +425,6 @@ See Documentation/vm/hugetlbpage.txt ============================================================== -nr_pdflush_threads - -The current number of pdflush threads. This value is read-only. -The value changes according to the number of dirty pages in the system. - -When necessary, additional pdflush threads are created, one per second, up to -nr_pdflush_threads_max. - -============================================================== - nr_trim_pages This is available only on NOMMU kernels. diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 50d0b78130a1..be3efc4f64f4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -52,11 +52,6 @@ struct wb_writeback_work { struct completion *done; /* set if the caller waits */ }; -/* - * We don't actually have pdflush, but this one is exported though /proc... - */ -int nr_pdflush_threads; - /** * writeback_in_progress - determine whether there is writeback in progress * @bdi: the device's backing_dev_info structure. diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 489de625cd25..c97c6b9cd38e 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -17,6 +17,7 @@ #include #include #include +#include struct page; struct device; @@ -304,6 +305,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync); void set_bdi_congested(struct backing_dev_info *bdi, int sync); long congestion_wait(int sync, long timeout); long wait_iff_congested(struct zone *zone, int sync, long timeout); +int pdflush_proc_obsolete(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi) { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 6d0a0fcd80e7..c66fe3332d83 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -189,9 +189,4 @@ void tag_pages_for_writeback(struct address_space *mapping, void account_page_redirty(struct page *page); -/* pdflush.c */ -extern int nr_pdflush_threads; /* Global so it can be exported to sysctl - read-only. */ - - #endif /* WRITEBACK_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 97186b99b0e4..6502d35a25ba 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1101,11 +1101,9 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, }, { - .procname = "nr_pdflush_threads", - .data = &nr_pdflush_threads, - .maxlen = sizeof nr_pdflush_threads, - .mode = 0444 /* read-only*/, - .proc_handler = proc_dointvec, + .procname = "nr_pdflush_threads", + .mode = 0444 /* read-only */, + .proc_handler = pdflush_proc_obsolete, }, { .procname = "swappiness", diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index a650694883a1..65bdcf198d4e 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -147,7 +147,7 @@ static const struct bin_table bin_vm_table[] = { { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ - { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, + /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */ { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, /* VM_PAGEBUF unused */ /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 3387aea11209..6b4718e2ee34 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -886,3 +886,23 @@ out: return ret; } EXPORT_SYMBOL(wait_iff_congested); + +int pdflush_proc_obsolete(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char kbuf[] = "0\n"; + + if (*ppos) { + *lenp = 0; + return 0; + } + + if (copy_to_user(buffer, kbuf, sizeof(kbuf))) + return -EFAULT; + printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n", + table->procname); + + *lenp = 2; + *ppos += *lenp; + return 2; +} -- cgit v1.2.3