From 31e332b911fca54df467d264d7e2a2ef9317f3ca Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Mon, 6 Oct 2025 01:15:26 +0200 Subject: fs: add missing fences to I_NEW handling Suppose there are 2 CPUs racing inode hash lookup func (say ilookup5()) and unlock_new_inode(). In principle the latter can clear the I_NEW flag before prior stores into the inode were made visible. The former can in turn observe I_NEW is cleared and proceed to use the inode, while possibly reading from not-yet-published areas. Signed-off-by: Mateusz Guzik Signed-off-by: Christian Brauner --- include/linux/writeback.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/writeback.h') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 22dd4adc5667..e1e1231a6830 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -194,6 +194,10 @@ static inline void wait_on_inode(struct inode *inode) { wait_var_event(inode_state_wait_address(inode, __I_NEW), !(READ_ONCE(inode->i_state) & I_NEW)); + /* + * Pairs with routines clearing I_NEW. + */ + smp_rmb(); } #ifdef CONFIG_CGROUP_WRITEBACK -- cgit v1.2.3 From af6023e2ce0a3d4d948885d464b0ddca4b8b1fdf Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 9 Oct 2025 09:59:15 +0200 Subject: fs: move wait_on_inode() from writeback.h to fs.h The only consumer outside of fs/inode.c is gfs2 and it already includes fs.h in the relevant file. Signed-off-by: Mateusz Guzik Signed-off-by: Christian Brauner --- include/linux/fs.h | 10 ++++++++++ include/linux/writeback.h | 11 ----------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include/linux/writeback.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index ac62b9d10b00..b35014ba681b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -949,6 +949,16 @@ static inline void inode_fake_hash(struct inode *inode) hlist_add_fake(&inode->i_hash); } +static inline void wait_on_inode(struct inode *inode) +{ + wait_var_event(inode_state_wait_address(inode, __I_NEW), + !(READ_ONCE(inode->i_state) & I_NEW)); + /* + * Pairs with routines clearing I_NEW. + */ + smp_rmb(); +} + /* * inode->i_rwsem nesting subclasses for the lock validator: * diff --git a/include/linux/writeback.h b/include/linux/writeback.h index e1e1231a6830..06195c2a535b 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -189,17 +189,6 @@ void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, void inode_wait_for_writeback(struct inode *inode); void inode_io_list_del(struct inode *inode); -/* writeback.h requires fs.h; it, too, is not included from here. */ -static inline void wait_on_inode(struct inode *inode) -{ - wait_var_event(inode_state_wait_address(inode, __I_NEW), - !(READ_ONCE(inode->i_state) & I_NEW)); - /* - * Pairs with routines clearing I_NEW. - */ - smp_rmb(); -} - #ifdef CONFIG_CGROUP_WRITEBACK #include -- cgit v1.2.3 From f5aa78e2be066f3801785094f1b55a3114fe461a Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 9 Oct 2025 09:59:19 +0200 Subject: Manual conversion to use ->i_state accessors of all places not covered by coccinelle Nothing to look at apart from iput_final(). Signed-off-by: Mateusz Guzik Signed-off-by: Christian Brauner --- Documentation/filesystems/porting.rst | 2 +- fs/afs/inode.c | 2 +- fs/ext4/inode.c | 10 +++++----- fs/ext4/orphan.c | 4 ++-- fs/inode.c | 18 ++++++++---------- include/linux/backing-dev.h | 2 +- include/linux/fs.h | 6 +++--- include/linux/writeback.h | 2 +- include/trace/events/writeback.h | 8 ++++---- 9 files changed, 26 insertions(+), 28 deletions(-) (limited to 'include/linux/writeback.h') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 7233b04668fc..35f027981b21 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -211,7 +211,7 @@ test and set for you. e.g.:: inode = iget_locked(sb, ino); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { err = read_inode_from_disk(inode); if (err < 0) { iget_failed(inode); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 2fe2ccf59c7a..dde1857fcabb 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -427,7 +427,7 @@ static void afs_fetch_status_success(struct afs_operation *op) struct afs_vnode *vnode = vp->vnode; int ret; - if (vnode->netfs.inode.i_state & I_NEW) { + if (inode_state_read_once(&vnode->netfs.inode) & I_NEW) { ret = afs_inode_init_from_status(op, vp, vnode); afs_op_set_error(op, ret); if (ret == 0) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f9e4ac87211e..b864e9645f85 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -425,7 +425,7 @@ void ext4_check_map_extents_env(struct inode *inode) if (!S_ISREG(inode->i_mode) || IS_NOQUOTA(inode) || IS_VERITY(inode) || is_special_ino(inode->i_sb, inode->i_ino) || - (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) || + (inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || ext4_verity_in_progress(inode)) return; @@ -3473,7 +3473,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) /* Any metadata buffers to write? */ if (!list_empty(&inode->i_mapping->i_private_list)) return true; - return inode->i_state & I_DIRTY_DATASYNC; + return inode_state_read_once(inode) & I_DIRTY_DATASYNC; } static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, @@ -4552,7 +4552,7 @@ int ext4_truncate(struct inode *inode) * or it's a completely new inode. In those cases we might not * have i_rwsem locked because it's not necessary. */ - if (!(inode->i_state & (I_NEW|I_FREEING))) + if (!(inode_state_read_once(inode) & (I_NEW | I_FREEING))) WARN_ON(!inode_is_locked(inode)); trace_ext4_truncate_enter(inode); @@ -5210,7 +5210,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { ret = check_igot_inode(inode, flags, function, line); if (ret) { iput(inode); @@ -5541,7 +5541,7 @@ static void __ext4_update_other_inode_time(struct super_block *sb, if (inode_is_dirtytime_only(inode)) { struct ext4_inode_info *ei = EXT4_I(inode); - inode->i_state &= ~I_DIRTY_TIME; + inode_state_clear(inode, I_DIRTY_TIME); spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index 33c3a89396b1..c4903d98ff81 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -107,7 +107,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) if (!sbi->s_journal || is_bad_inode(inode)) return 0; - WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); if (ext4_inode_orphan_tracked(inode)) return 0; @@ -232,7 +232,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) return 0; - WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE)) return ext4_orphan_file_del(handle, inode); diff --git a/fs/inode.c b/fs/inode.c index f094ed3e6f30..3153d725859c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -829,7 +829,7 @@ static void evict(struct inode *inode) * This also means we don't need any fences for the call below. */ inode_wake_up_bit(inode, __I_NEW); - BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); + BUG_ON(inode_state_read_once(inode) != (I_FREEING | I_CLEAR)); destroy_inode(inode); } @@ -1883,7 +1883,6 @@ static void iput_final(struct inode *inode) { struct super_block *sb = inode->i_sb; const struct super_operations *op = inode->i_sb->s_op; - unsigned long state; int drop; WARN_ON(inode_state_read(inode) & I_NEW); @@ -1908,20 +1907,19 @@ static void iput_final(struct inode *inode) */ VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode); - state = inode_state_read(inode); - if (!drop) { - WRITE_ONCE(inode->i_state, state | I_WILL_FREE); + if (drop) { + inode_state_set(inode, I_FREEING); + } else { + inode_state_set(inode, I_WILL_FREE); spin_unlock(&inode->i_lock); write_inode_now(inode, 1); spin_lock(&inode->i_lock); - state = inode_state_read(inode); - WARN_ON(state & I_NEW); - state &= ~I_WILL_FREE; + WARN_ON(inode_state_read(inode) & I_NEW); + inode_state_replace(inode, I_WILL_FREE, I_FREEING); } - WRITE_ONCE(inode->i_state, state | I_FREEING); if (!list_empty(&inode->i_lru)) inode_lru_list_del(inode); spin_unlock(&inode->i_lock); @@ -2985,7 +2983,7 @@ void dump_inode(struct inode *inode, const char *reason) pr_warn("%s encountered for inode %px\n" "fs %s mode %ho opflags 0x%hx flags 0x%x state 0x%x count %d\n", reason, inode, sb->s_type->name, inode->i_mode, inode->i_opflags, - inode->i_flags, inode->i_state, atomic_read(&inode->i_count)); + inode->i_flags, inode_state_read_once(inode), atomic_read(&inode->i_count)); } EXPORT_SYMBOL(dump_inode); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 065cba5dc111..0c8342747cab 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -280,7 +280,7 @@ unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) * Paired with a release fence in inode_do_switch_wbs() and * ensures that we see the new wb if we see cleared I_WB_SWITCH. */ - cookie->locked = inode->i_state & I_WB_SWITCH; + cookie->locked = inode_state_read_once(inode) & I_WB_SWITCH; smp_rmb(); if (unlikely(cookie->locked)) diff --git a/include/linux/fs.h b/include/linux/fs.h index 909eb1e68637..77b6486dcae7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1026,7 +1026,7 @@ static inline void inode_fake_hash(struct inode *inode) static inline void wait_on_inode(struct inode *inode) { wait_var_event(inode_state_wait_address(inode, __I_NEW), - !(READ_ONCE(inode->i_state) & I_NEW)); + !(inode_state_read_once(inode) & I_NEW)); /* * Pairs with routines clearing I_NEW. */ @@ -2719,8 +2719,8 @@ static inline int icount_read(const struct inode *inode) */ static inline bool inode_is_dirtytime_only(struct inode *inode) { - return (inode->i_state & (I_DIRTY_TIME | I_NEW | - I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME; + return (inode_state_read_once(inode) & + (I_DIRTY_TIME | I_NEW | I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME; } extern void inc_nlink(struct inode *inode); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 06195c2a535b..102071ffedcb 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -227,7 +227,7 @@ static inline void inode_attach_wb(struct inode *inode, struct folio *folio) static inline void inode_detach_wb(struct inode *inode) { if (inode->i_wb) { - WARN_ON_ONCE(!(inode->i_state & I_CLEAR)); + WARN_ON_ONCE(!(inode_state_read_once(inode) & I_CLEAR)); wb_put(inode->i_wb); inode->i_wb = NULL; } diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index c08aff044e80..311a341e6fe4 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -120,7 +120,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, /* may be called for files on pseudo FSes w/ unregistered bdi */ strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->ino = inode->i_ino; - __entry->state = inode->i_state; + __entry->state = inode_state_read_once(inode); __entry->flags = flags; ), @@ -748,7 +748,7 @@ TRACE_EVENT(writeback_sb_inodes_requeue, strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; - __entry->state = inode->i_state; + __entry->state = inode_state_read_once(inode); __entry->dirtied_when = inode->dirtied_when; __entry->cgroup_ino = __trace_wb_assign_cgroup(inode_to_wb(inode)); ), @@ -787,7 +787,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; - __entry->state = inode->i_state; + __entry->state = inode_state_read_once(inode); __entry->dirtied_when = inode->dirtied_when; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->nr_to_write = nr_to_write; @@ -839,7 +839,7 @@ DECLARE_EVENT_CLASS(writeback_inode_template, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->state = inode->i_state; + __entry->state = inode_state_read_once(inode); __entry->mode = inode->i_mode; __entry->dirtied_when = inode->dirtied_when; ), -- cgit v1.2.3 From 90db4d4441f58d433ecf74f7e3bd17e0a553c20c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Oct 2025 05:45:48 +0200 Subject: writeback: allow the file system to override MIN_WRITEBACK_PAGES The relatively low minimal writeback size of 4MiB means that written back inodes on rotational media are switched a lot. Besides introducing additional seeks, this also can lead to extreme file fragmentation on zoned devices when a lot of files are cached relative to the available writeback bandwidth. Add a superblock field that allows the file system to override the default size. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251017034611.651385-3-hch@lst.de Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 14 +++++--------- fs/super.c | 1 + include/linux/fs.h | 1 + include/linux/writeback.h | 5 +++++ 4 files changed, 12 insertions(+), 9 deletions(-) (limited to 'include/linux/writeback.h') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 30de37865fa1..52763fa499d6 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -32,11 +32,6 @@ #include #include "internal.h" -/* - * 4MB minimal write chunk size - */ -#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) - /* * Passed into wb_writeback(), essentially a subset of writeback_control */ @@ -1889,8 +1884,8 @@ out: return ret; } -static long writeback_chunk_size(struct bdi_writeback *wb, - struct wb_writeback_work *work) +static long writeback_chunk_size(struct super_block *sb, + struct bdi_writeback *wb, struct wb_writeback_work *work) { long pages; @@ -1913,7 +1908,8 @@ static long writeback_chunk_size(struct bdi_writeback *wb, pages = min(wb->avg_write_bandwidth / 2, global_wb_domain.dirty_limit / DIRTY_SCOPE); pages = min(pages, work->nr_pages); - return round_down(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES); + return round_down(pages + sb->s_min_writeback_pages, + sb->s_min_writeback_pages); } /* @@ -2015,7 +2011,7 @@ static long writeback_sb_inodes(struct super_block *sb, inode->i_state |= I_SYNC; wbc_attach_and_unlock_inode(&wbc, inode); - write_chunk = writeback_chunk_size(wb, work); + write_chunk = writeback_chunk_size(inode->i_sb, wb, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; diff --git a/fs/super.c b/fs/super.c index 5bab94fb7e03..599c1d2641fe 100644 --- a/fs/super.c +++ b/fs/super.c @@ -389,6 +389,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, goto fail; if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink)) goto fail; + s->s_min_writeback_pages = MIN_WRITEBACK_PAGES; return s; fail: diff --git a/include/linux/fs.h b/include/linux/fs.h index a5dbfa20f8d7..6bf369095d2e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1583,6 +1583,7 @@ struct super_block { spinlock_t s_inode_wblist_lock; struct list_head s_inodes_wb; /* writeback inodes */ + long s_min_writeback_pages; } __randomize_layout; static inline struct user_namespace *i_user_ns(const struct inode *inode) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 22dd4adc5667..49e1dd96f43e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -374,4 +374,9 @@ bool redirty_page_for_writepage(struct writeback_control *, struct page *); void sb_mark_inode_writeback(struct inode *inode); void sb_clear_inode_writeback(struct inode *inode); +/* + * 4MB minimal write chunk size + */ +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) + #endif /* WRITEBACK_H */ -- cgit v1.2.3 From 4952f35f0545f3b53dab8d5fd727c4827c2a2778 Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Mon, 29 Sep 2025 19:13:49 +0800 Subject: fs: Make wbc_to_tag() inline and use it in fs. The logic in wbc_to_tag() is widely used in file systems, so modify this function to be inline and use it in file systems. This patch has only passed compilation tests, but it should be fine. Signed-off-by: Julian Sun Reviewed-by: Qu Wenruo Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/btrfs/extent_io.c | 5 +---- fs/ceph/addr.c | 6 +----- fs/ext4/inode.c | 5 +---- fs/f2fs/data.c | 5 +---- fs/gfs2/aops.c | 5 +---- include/linux/writeback.h | 7 +++++++ mm/page-writeback.c | 6 ------ 7 files changed, 12 insertions(+), 27 deletions(-) (limited to 'include/linux/writeback.h') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c123a3ef154a..170dd7e80d11 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2460,10 +2460,7 @@ static int extent_write_cache_pages(struct address_space *mapping, &BTRFS_I(inode)->runtime_flags)) wbc->tagged_writepages = 1; - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 322ed268f14a..63b75d214210 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1045,11 +1045,7 @@ void ceph_init_writeback_ctl(struct address_space *mapping, ceph_wbc->index = ceph_wbc->start_index; ceph_wbc->end = -1; - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { - ceph_wbc->tag = PAGECACHE_TAG_TOWRITE; - } else { - ceph_wbc->tag = PAGECACHE_TAG_DIRTY; - } + ceph_wbc->tag = wbc_to_tag(wbc); ceph_wbc->op_idx = -1; ceph_wbc->num_ops = 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f9e4ac87211e..58d6194045e2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2619,10 +2619,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) handle_t *handle = NULL; int bpp = ext4_journal_blocks_per_folio(mpd->inode); - if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(mpd->wbc); mpd->map.m_len = 0; mpd->next_pos = mpd->start_pos; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ef38e62cda8f..826bcfb8230c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2986,10 +2986,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(wbc); retry: retry = 0; if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 47d74afd63ac..12394fc5dd29 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -311,10 +311,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 49e1dd96f43e..2a81816f7507 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -196,6 +196,13 @@ static inline void wait_on_inode(struct inode *inode) !(READ_ONCE(inode->i_state) & I_NEW)); } +static inline xa_mark_t wbc_to_tag(struct writeback_control *wbc) +{ + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + return PAGECACHE_TAG_TOWRITE; + return PAGECACHE_TAG_DIRTY; +} + #ifdef CONFIG_CGROUP_WRITEBACK #include diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 757bc4d3b5b5..a124ab6a205d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2434,12 +2434,6 @@ static bool folio_prepare_writeback(struct address_space *mapping, return true; } -static xa_mark_t wbc_to_tag(struct writeback_control *wbc) -{ - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - return PAGECACHE_TAG_TOWRITE; - return PAGECACHE_TAG_DIRTY; -} static pgoff_t wbc_end(struct writeback_control *wbc) { -- cgit v1.2.3 From 64dd89ae01f2708a508e028c28b7906e4702a9a7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 15 Dec 2025 12:57:53 -0500 Subject: mm/block/fs: remove laptop_mode Laptop mode was introduced to save battery, by delaying and consolidating writes and thereby maximize the time rotating hard drives wouldn't have to spin. Luckily, rotating hard drives, with their high spin-up times and power draw, are a thing of the past for battery-powered devices. Reclaim has also since changed to not write single filesystem pages anymore, and regular filesystem writeback is lumpy by design. The juice doesn't appear worth the squeeze anymore. The footprint of the feature is small, but nevertheless it's a complicating factor in mm, block, filesystems. Developers don't think about it, and it likely hasn't been tested with new reclaim and writeback changes in years. Let's sunset it. Keep the sysctl with a deprecation warning around for a few more cycles, but remove all functionality behind it. [akpm@linux-foundation.org: fix Documentation/admin-guide/laptops/index.rst] Link: https://lkml.kernel.org/r/20251216185201.GH905277@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Acked-by: Jens Axboe Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Cc: Deepanshu Kartikey Signed-off-by: Andrew Morton --- Documentation/admin-guide/laptops/index.rst | 1 - Documentation/admin-guide/laptops/laptop-mode.rst | 770 ---------------------- Documentation/admin-guide/sysctl/vm.rst | 8 - block/blk-mq.c | 3 - fs/ext4/inode.c | 3 +- fs/sync.c | 2 - fs/xfs/xfs_super.c | 9 - include/linux/backing-dev-defs.h | 3 - include/linux/writeback.h | 4 - include/trace/events/writeback.h | 1 - include/uapi/linux/sysctl.h | 2 +- mm/backing-dev.c | 3 - mm/page-writeback.c | 74 +-- mm/vmscan.c | 30 +- 14 files changed, 25 insertions(+), 888 deletions(-) delete mode 100644 Documentation/admin-guide/laptops/laptop-mode.rst (limited to 'include/linux/writeback.h') diff --git a/Documentation/admin-guide/laptops/index.rst b/Documentation/admin-guide/laptops/index.rst index 6432c251dc95..c0b911d05c59 100644 --- a/Documentation/admin-guide/laptops/index.rst +++ b/Documentation/admin-guide/laptops/index.rst @@ -10,7 +10,6 @@ Laptop Drivers alienware-wmi asus-laptop disk-shock-protection - laptop-mode lg-laptop samsung-galaxybook sony-laptop diff --git a/Documentation/admin-guide/laptops/laptop-mode.rst b/Documentation/admin-guide/laptops/laptop-mode.rst deleted file mode 100644 index 66eb9cd918b5..000000000000 --- a/Documentation/admin-guide/laptops/laptop-mode.rst +++ /dev/null @@ -1,770 +0,0 @@ -=============================================== -How to conserve battery power using laptop-mode -=============================================== - -Document Author: Bart Samwel (bart@samwel.tk) - -Date created: January 2, 2004 - -Last modified: December 06, 2004 - -Introduction ------------- - -Laptop mode is used to minimize the time that the hard disk needs to be spun up, -to conserve battery power on laptops. It has been reported to cause significant -power savings. - -.. Contents - - * Introduction - * Installation - * Caveats - * The Details - * Tips & Tricks - * Control script - * ACPI integration - * Monitoring tool - - -Installation ------------- - -To use laptop mode, you don't need to set any kernel configuration options -or anything. Simply install all the files included in this document, and -laptop mode will automatically be started when you're on battery. For -your convenience, a tarball containing an installer can be downloaded at: - - http://www.samwel.tk/laptop_mode/laptop_mode/ - -To configure laptop mode, you need to edit the configuration file, which is -located in /etc/default/laptop-mode on Debian-based systems, or in -/etc/sysconfig/laptop-mode on other systems. - -Unfortunately, automatic enabling of laptop mode does not work for -laptops that don't have ACPI. On those laptops, you need to start laptop -mode manually. To start laptop mode, run "laptop_mode start", and to -stop it, run "laptop_mode stop". (Note: The laptop mode tools package now -has experimental support for APM, you might want to try that first.) - - -Caveats -------- - -* The downside of laptop mode is that you have a chance of losing up to 10 - minutes of work. If you cannot afford this, don't use it! The supplied ACPI - scripts automatically turn off laptop mode when the battery almost runs out, - so that you won't lose any data at the end of your battery life. - -* Most desktop hard drives have a very limited lifetime measured in spindown - cycles, typically about 50.000 times (it's usually listed on the spec sheet). - Check your drive's rating, and don't wear down your drive's lifetime if you - don't need to. - -* If you mount some of your ext3 filesystems with the -n option, then - the control script will not be able to remount them correctly. You must set - DO_REMOUNTS=0 in the control script, otherwise it will remount them with the - wrong options -- or it will fail because it cannot write to /etc/mtab. - -* If you have your filesystems listed as type "auto" in fstab, like I did, then - the control script will not recognize them as filesystems that need remounting. - You must list the filesystems with their true type instead. - -* It has been reported that some versions of the mutt mail client use file access - times to determine whether a folder contains new mail. If you use mutt and - experience this, you must disable the noatime remounting by setting the option - DO_REMOUNT_NOATIME to 0 in the configuration file. - - -The Details ------------ - -Laptop mode is controlled by the knob /proc/sys/vm/laptop_mode. This knob is -present for all kernels that have the laptop mode patch, regardless of any -configuration options. When the knob is set, any physical disk I/O (that might -have caused the hard disk to spin up) causes Linux to flush all dirty blocks. The -result of this is that after a disk has spun down, it will not be spun up -anymore to write dirty blocks, because those blocks had already been written -immediately after the most recent read operation. The value of the laptop_mode -knob determines the time between the occurrence of disk I/O and when the flush -is triggered. A sensible value for the knob is 5 seconds. Setting the knob to -0 disables laptop mode. - -To increase the effectiveness of the laptop_mode strategy, the laptop_mode -control script increases dirty_expire_centisecs and dirty_writeback_centisecs in -/proc/sys/vm to about 10 minutes (by default), which means that pages that are -dirtied are not forced to be written to disk as often. The control script also -changes the dirty background ratio, so that background writeback of dirty pages -is not done anymore. Combined with a higher commit value (also 10 minutes) for -ext3 filesystem (also done automatically by the control script), -this results in concentration of disk activity in a small time interval which -occurs only once every 10 minutes, or whenever the disk is forced to spin up by -a cache miss. The disk can then be spun down in the periods of inactivity. - - -Configuration -------------- - -The laptop mode configuration file is located in /etc/default/laptop-mode on -Debian-based systems, or in /etc/sysconfig/laptop-mode on other systems. It -contains the following options: - -MAX_AGE: - -Maximum time, in seconds, of hard drive spindown time that you are -comfortable with. Worst case, it's possible that you could lose this -amount of work if your battery fails while you're in laptop mode. - -MINIMUM_BATTERY_MINUTES: - -Automatically disable laptop mode if the remaining number of minutes of -battery power is less than this value. Default is 10 minutes. - -AC_HD/BATT_HD: - -The idle timeout that should be set on your hard drive when laptop mode -is active (BATT_HD) and when it is not active (AC_HD). The defaults are -20 seconds (value 4) for BATT_HD and 2 hours (value 244) for AC_HD. The -possible values are those listed in the manual page for "hdparm" for the -"-S" option. - -HD: - -The devices for which the spindown timeout should be adjusted by laptop mode. -Default is /dev/hda. If you specify multiple devices, separate them by a space. - -READAHEAD: - -Disk readahead, in 512-byte sectors, while laptop mode is active. A large -readahead can prevent disk accesses for things like executable pages (which are -loaded on demand while the application executes) and sequentially accessed data -(MP3s). - -DO_REMOUNTS: - -The control script automatically remounts any mounted journaled filesystems -with appropriate commit interval options. When this option is set to 0, this -feature is disabled. - -DO_REMOUNT_NOATIME: - -When remounting, should the filesystems be remounted with the noatime option? -Normally, this is set to "1" (enabled), but there may be programs that require -access time recording. - -DIRTY_RATIO: - -The percentage of memory that is allowed to contain "dirty" or unsaved data -before a writeback is forced, while laptop mode is active. Corresponds to -the /proc/sys/vm/dirty_ratio sysctl. - -DIRTY_BACKGROUND_RATIO: - -The percentage of memory that is allowed to contain "dirty" or unsaved data -after a forced writeback is done due to an exceeding of DIRTY_RATIO. Set -this nice and low. This corresponds to the /proc/sys/vm/dirty_background_ratio -sysctl. - -Note that the behaviour of dirty_background_ratio is quite different -when laptop mode is active and when it isn't. When laptop mode is inactive, -dirty_background_ratio is the threshold percentage at which background writeouts -start taking place. When laptop mode is active, however, background writeouts -are disabled, and the dirty_background_ratio only determines how much writeback -is done when dirty_ratio is reached. - -DO_CPU: - -Enable CPU frequency scaling when in laptop mode. (Requires CPUFreq to be setup. -See Documentation/admin-guide/pm/cpufreq.rst for more info. Disabled by default.) - -CPU_MAXFREQ: - -When on battery, what is the maximum CPU speed that the system should use? Legal -values are "slowest" for the slowest speed that your CPU is able to operate at, -or a value listed in /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies. - - -Tips & Tricks -------------- - -* Bartek Kania reports getting up to 50 minutes of extra battery life (on top - of his regular 3 to 3.5 hours) using a spindown time of 5 seconds (BATT_HD=1). - -* You can spin down the disk while playing MP3, by setting disk readahead - to 8MB (READAHEAD=16384). Effectively, the disk will read a complete MP3 at - once, and will then spin down while the MP3 is playing. (Thanks to Bartek - Kania.) - -* Drew Scott Daniels observed: "I don't know why, but when I decrease the number - of colours that my display uses it consumes less battery power. I've seen - this on powerbooks too. I hope that this is a piece of information that - might be useful to the Laptop Mode patch or its users." - -* In syslog.conf, you can prefix entries with a dash `-` to omit syncing the - file after every logging. When you're using laptop-mode and your disk doesn't - spin down, this is a likely culprit. - -* Richard Atterer observed that laptop mode does not work well with noflushd - (http://noflushd.sourceforge.net/), it seems that noflushd prevents laptop-mode - from doing its thing. - -* If you're worried about your data, you might want to consider using a USB - memory stick or something like that as a "working area". (Be aware though - that flash memory can only handle a limited number of writes, and overuse - may wear out your memory stick pretty quickly. Do _not_ use journalling - filesystems on flash memory sticks.) - - -Configuration file for control and ACPI battery scripts -------------------------------------------------------- - -This allows the tunables to be changed for the scripts via an external -configuration file - -It should be installed as /etc/default/laptop-mode on Debian, and as -/etc/sysconfig/laptop-mode on Red Hat, SUSE, Mandrake, and other work-alikes. - -Config file:: - - # Maximum time, in seconds, of hard drive spindown time that you are - # comfortable with. Worst case, it's possible that you could lose this - # amount of work if your battery fails you while in laptop mode. - #MAX_AGE=600 - - # Automatically disable laptop mode when the number of minutes of battery - # that you have left goes below this threshold. - MINIMUM_BATTERY_MINUTES=10 - - # Read-ahead, in 512-byte sectors. You can spin down the disk while playing MP3/OGG - # by setting the disk readahead to 8MB (READAHEAD=16384). Effectively, the disk - # will read a complete MP3 at once, and will then spin down while the MP3/OGG is - # playing. - #READAHEAD=4096 - - # Shall we remount journaled fs. with appropriate commit interval? (1=yes) - #DO_REMOUNTS=1 - - # And shall we add the "noatime" option to that as well? (1=yes) - #DO_REMOUNT_NOATIME=1 - - # Dirty synchronous ratio. At this percentage of dirty pages the process - # which - # calls write() does its own writeback - #DIRTY_RATIO=40 - - # - # Allowed dirty background ratio, in percent. Once DIRTY_RATIO has been - # exceeded, the kernel will wake flusher threads which will then reduce the - # amount of dirty memory to dirty_background_ratio. Set this nice and low, - # so once some writeout has commenced, we do a lot of it. - # - #DIRTY_BACKGROUND_RATIO=5 - - # kernel default dirty buffer age - #DEF_AGE=30 - #DEF_UPDATE=5 - #DEF_DIRTY_BACKGROUND_RATIO=10 - #DEF_DIRTY_RATIO=40 - #DEF_XFS_AGE_BUFFER=15 - #DEF_XFS_SYNC_INTERVAL=30 - #DEF_XFS_BUFD_INTERVAL=1 - - # This must be adjusted manually to the value of HZ in the running kernel - # on 2.4, until the XFS people change their 2.4 external interfaces to work in - # centisecs. This can be automated, but it's a work in progress that still - # needs# some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for - # external interfaces, and that is currently always set to 100. So you don't - # need to change this on 2.6. - #XFS_HZ=100 - - # Should the maximum CPU frequency be adjusted down while on battery? - # Requires CPUFreq to be setup. - # See Documentation/admin-guide/pm/cpufreq.rst for more info - #DO_CPU=0 - - # When on battery what is the maximum CPU speed that the system should - # use? Legal values are "slowest" for the slowest speed that your - # CPU is able to operate at, or a value listed in: - # /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies - # Only applicable if DO_CPU=1. - #CPU_MAXFREQ=slowest - - # Idle timeout for your hard drive (man hdparm for valid values, -S option) - # Default is 2 hours on AC (AC_HD=244) and 20 seconds for battery (BATT_HD=4). - #AC_HD=244 - #BATT_HD=4 - - # The drives for which to adjust the idle timeout. Separate them by a space, - # e.g. HD="/dev/hda /dev/hdb". - #HD="/dev/hda" - - # Set the spindown timeout on a hard drive? - #DO_HD=1 - - -Control script --------------- - -Please note that this control script works for the Linux 2.4 and 2.6 series (thanks -to Kiko Piris). - -Control script:: - - #!/bin/bash - - # start or stop laptop_mode, best run by a power management daemon when - # ac gets connected/disconnected from a laptop - # - # install as /sbin/laptop_mode - # - # Contributors to this script: Kiko Piris - # Bart Samwel - # Micha Feigin - # Andrew Morton - # Herve Eychenne - # Dax Kelson - # - # Original Linux 2.4 version by: Jens Axboe - - ############################################################################# - - # Source config - if [ -f /etc/default/laptop-mode ] ; then - # Debian - . /etc/default/laptop-mode - elif [ -f /etc/sysconfig/laptop-mode ] ; then - # Others - . /etc/sysconfig/laptop-mode - fi - - # Don't raise an error if the config file is incomplete - # set defaults instead: - - # Maximum time, in seconds, of hard drive spindown time that you are - # comfortable with. Worst case, it's possible that you could lose this - # amount of work if your battery fails you while in laptop mode. - MAX_AGE=${MAX_AGE:-'600'} - - # Read-ahead, in kilobytes - READAHEAD=${READAHEAD:-'4096'} - - # Shall we remount journaled fs. with appropriate commit interval? (1=yes) - DO_REMOUNTS=${DO_REMOUNTS:-'1'} - - # And shall we add the "noatime" option to that as well? (1=yes) - DO_REMOUNT_NOATIME=${DO_REMOUNT_NOATIME:-'1'} - - # Shall we adjust the idle timeout on a hard drive? - DO_HD=${DO_HD:-'1'} - - # Adjust idle timeout on which hard drive? - HD="${HD:-'/dev/hda'}" - - # spindown time for HD (hdparm -S values) - AC_HD=${AC_HD:-'244'} - BATT_HD=${BATT_HD:-'4'} - - # Dirty synchronous ratio. At this percentage of dirty pages the process which - # calls write() does its own writeback - DIRTY_RATIO=${DIRTY_RATIO:-'40'} - - # cpu frequency scaling - # See Documentation/admin-guide/pm/cpufreq.rst for more info - DO_CPU=${CPU_MANAGE:-'0'} - CPU_MAXFREQ=${CPU_MAXFREQ:-'slowest'} - - # - # Allowed dirty background ratio, in percent. Once DIRTY_RATIO has been - # exceeded, the kernel will wake flusher threads which will then reduce the - # amount of dirty memory to dirty_background_ratio. Set this nice and low, - # so once some writeout has commenced, we do a lot of it. - # - DIRTY_BACKGROUND_RATIO=${DIRTY_BACKGROUND_RATIO:-'5'} - - # kernel default dirty buffer age - DEF_AGE=${DEF_AGE:-'30'} - DEF_UPDATE=${DEF_UPDATE:-'5'} - DEF_DIRTY_BACKGROUND_RATIO=${DEF_DIRTY_BACKGROUND_RATIO:-'10'} - DEF_DIRTY_RATIO=${DEF_DIRTY_RATIO:-'40'} - DEF_XFS_AGE_BUFFER=${DEF_XFS_AGE_BUFFER:-'15'} - DEF_XFS_SYNC_INTERVAL=${DEF_XFS_SYNC_INTERVAL:-'30'} - DEF_XFS_BUFD_INTERVAL=${DEF_XFS_BUFD_INTERVAL:-'1'} - - # This must be adjusted manually to the value of HZ in the running kernel - # on 2.4, until the XFS people change their 2.4 external interfaces to work in - # centisecs. This can be automated, but it's a work in progress that still needs - # some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for external - # interfaces, and that is currently always set to 100. So you don't need to - # change this on 2.6. - XFS_HZ=${XFS_HZ:-'100'} - - ############################################################################# - - KLEVEL="$(uname -r | - { - IFS='.' read a b c - echo $a.$b - } - )" - case "$KLEVEL" in - "2.4"|"2.6") - ;; - *) - echo "Unhandled kernel version: $KLEVEL ('uname -r' = '$(uname -r)')" >&2 - exit 1 - ;; - esac - - if [ ! -e /proc/sys/vm/laptop_mode ] ; then - echo "Kernel is not patched with laptop_mode patch." >&2 - exit 1 - fi - - if [ ! -w /proc/sys/vm/laptop_mode ] ; then - echo "You do not have enough privileges to enable laptop_mode." >&2 - exit 1 - fi - - # Remove an option (the first parameter) of the form option= from - # a mount options string (the rest of the parameters). - parse_mount_opts () { - OPT="$1" - shift - echo ",$*," | sed \ - -e 's/,'"$OPT"'=[0-9]*,/,/g' \ - -e 's/,,*/,/g' \ - -e 's/^,//' \ - -e 's/,$//' - } - - # Remove an option (the first parameter) without any arguments from - # a mount option string (the rest of the parameters). - parse_nonumber_mount_opts () { - OPT="$1" - shift - echo ",$*," | sed \ - -e 's/,'"$OPT"',/,/g' \ - -e 's/,,*/,/g' \ - -e 's/^,//' \ - -e 's/,$//' - } - - # Find out the state of a yes/no option (e.g. "atime"/"noatime") in - # fstab for a given filesystem, and use this state to replace the - # value of the option in another mount options string. The device - # is the first argument, the option name the second, and the default - # value the third. The remainder is the mount options string. - # - # Example: - # parse_yesno_opts_wfstab /dev/hda1 atime atime defaults,noatime - # - # If fstab contains, say, "rw" for this filesystem, then the result - # will be "defaults,atime". - parse_yesno_opts_wfstab () { - L_DEV="$1" - OPT="$2" - DEF_OPT="$3" - shift 3 - L_OPTS="$*" - PARSEDOPTS1="$(parse_nonumber_mount_opts $OPT $L_OPTS)" - PARSEDOPTS1="$(parse_nonumber_mount_opts no$OPT $PARSEDOPTS1)" - # Watch for a default atime in fstab - FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)" - if echo "$FSTAB_OPTS" | grep "$OPT" > /dev/null ; then - # option specified in fstab: extract the value and use it - if echo "$FSTAB_OPTS" | grep "no$OPT" > /dev/null ; then - echo "$PARSEDOPTS1,no$OPT" - else - # no$OPT not found -- so we must have $OPT. - echo "$PARSEDOPTS1,$OPT" - fi - else - # option not specified in fstab -- choose the default. - echo "$PARSEDOPTS1,$DEF_OPT" - fi - } - - # Find out the state of a numbered option (e.g. "commit=NNN") in - # fstab for a given filesystem, and use this state to replace the - # value of the option in another mount options string. The device - # is the first argument, and the option name the second. The - # remainder is the mount options string in which the replacement - # must be done. - # - # Example: - # parse_mount_opts_wfstab /dev/hda1 commit defaults,commit=7 - # - # If fstab contains, say, "commit=3,rw" for this filesystem, then the - # result will be "rw,commit=3". - parse_mount_opts_wfstab () { - L_DEV="$1" - OPT="$2" - shift 2 - L_OPTS="$*" - PARSEDOPTS1="$(parse_mount_opts $OPT $L_OPTS)" - # Watch for a default commit in fstab - FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)" - if echo "$FSTAB_OPTS" | grep "$OPT=" > /dev/null ; then - # option specified in fstab: extract the value, and use it - echo -n "$PARSEDOPTS1,$OPT=" - echo ",$FSTAB_OPTS," | sed \ - -e 's/.*,'"$OPT"'=//' \ - -e 's/,.*//' - else - # option not specified in fstab: set it to 0 - echo "$PARSEDOPTS1,$OPT=0" - fi - } - - deduce_fstype () { - MP="$1" - # My root filesystem unfortunately has - # type "unknown" in /etc/mtab. If we encounter - # "unknown", we try to get the type from fstab. - cat /etc/fstab | - grep -v '^#' | - while read FSTAB_DEV FSTAB_MP FSTAB_FST FSTAB_OPTS FSTAB_DUMP FSTAB_DUMP ; do - if [ "$FSTAB_MP" = "$MP" ]; then - echo $FSTAB_FST - exit 0 - fi - done - } - - if [ $DO_REMOUNT_NOATIME -eq 1 ] ; then - NOATIME_OPT=",noatime" - fi - - case "$1" in - start) - AGE=$((100*$MAX_AGE)) - XFS_AGE=$(($XFS_HZ*$MAX_AGE)) - echo -n "Starting laptop_mode" - - if [ -d /proc/sys/vm/pagebuf ] ; then - # (For 2.4 and early 2.6.) - # This only needs to be set, not reset -- it is only used when - # laptop mode is enabled. - echo $XFS_AGE > /proc/sys/vm/pagebuf/lm_flush_age - echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval - elif [ -f /proc/sys/fs/xfs/lm_age_buffer ] ; then - # (A couple of early 2.6 laptop mode patches had these.) - # The same goes for these. - echo $XFS_AGE > /proc/sys/fs/xfs/lm_age_buffer - echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval - elif [ -f /proc/sys/fs/xfs/age_buffer ] ; then - # (2.6.6) - # But not for these -- they are also used in normal - # operation. - echo $XFS_AGE > /proc/sys/fs/xfs/age_buffer - echo $XFS_AGE > /proc/sys/fs/xfs/sync_interval - elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then - # (2.6.7 upwards) - # And not for these either. These are in centisecs, - # not USER_HZ, so we have to use $AGE, not $XFS_AGE. - echo $AGE > /proc/sys/fs/xfs/age_buffer_centisecs - echo $AGE > /proc/sys/fs/xfs/xfssyncd_centisecs - echo 3000 > /proc/sys/fs/xfs/xfsbufd_centisecs - fi - - case "$KLEVEL" in - "2.4") - echo 1 > /proc/sys/vm/laptop_mode - echo "30 500 0 0 $AGE $AGE 60 20 0" > /proc/sys/vm/bdflush - ;; - "2.6") - echo 5 > /proc/sys/vm/laptop_mode - echo "$AGE" > /proc/sys/vm/dirty_writeback_centisecs - echo "$AGE" > /proc/sys/vm/dirty_expire_centisecs - echo "$DIRTY_RATIO" > /proc/sys/vm/dirty_ratio - echo "$DIRTY_BACKGROUND_RATIO" > /proc/sys/vm/dirty_background_ratio - ;; - esac - if [ $DO_REMOUNTS -eq 1 ]; then - cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do - PARSEDOPTS="$(parse_mount_opts "$OPTS")" - if [ "$FST" = 'unknown' ]; then - FST=$(deduce_fstype $MP) - fi - case "$FST" in - "ext3") - PARSEDOPTS="$(parse_mount_opts commit "$OPTS")" - mount $DEV -t $FST $MP -o remount,$PARSEDOPTS,commit=$MAX_AGE$NOATIME_OPT - ;; - "xfs") - mount $DEV -t $FST $MP -o remount,$OPTS$NOATIME_OPT - ;; - esac - if [ -b $DEV ] ; then - blockdev --setra $(($READAHEAD * 2)) $DEV - fi - done - fi - if [ $DO_HD -eq 1 ] ; then - for THISHD in $HD ; do - /sbin/hdparm -S $BATT_HD $THISHD > /dev/null 2>&1 - /sbin/hdparm -B 1 $THISHD > /dev/null 2>&1 - done - fi - if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then - if [ $CPU_MAXFREQ = 'slowest' ]; then - CPU_MAXFREQ=`cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq` - fi - echo $CPU_MAXFREQ > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq - fi - echo "." - ;; - stop) - U_AGE=$((100*$DEF_UPDATE)) - B_AGE=$((100*$DEF_AGE)) - echo -n "Stopping laptop_mode" - echo 0 > /proc/sys/vm/laptop_mode - if [ -f /proc/sys/fs/xfs/age_buffer -a ! -f /proc/sys/fs/xfs/lm_age_buffer ] ; then - # These need to be restored, if there are no lm_*. - echo $(($XFS_HZ*$DEF_XFS_AGE_BUFFER)) > /proc/sys/fs/xfs/age_buffer - echo $(($XFS_HZ*$DEF_XFS_SYNC_INTERVAL)) > /proc/sys/fs/xfs/sync_interval - elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then - # These need to be restored as well. - echo $((100*$DEF_XFS_AGE_BUFFER)) > /proc/sys/fs/xfs/age_buffer_centisecs - echo $((100*$DEF_XFS_SYNC_INTERVAL)) > /proc/sys/fs/xfs/xfssyncd_centisecs - echo $((100*$DEF_XFS_BUFD_INTERVAL)) > /proc/sys/fs/xfs/xfsbufd_centisecs - fi - case "$KLEVEL" in - "2.4") - echo "30 500 0 0 $U_AGE $B_AGE 60 20 0" > /proc/sys/vm/bdflush - ;; - "2.6") - echo "$U_AGE" > /proc/sys/vm/dirty_writeback_centisecs - echo "$B_AGE" > /proc/sys/vm/dirty_expire_centisecs - echo "$DEF_DIRTY_RATIO" > /proc/sys/vm/dirty_ratio - echo "$DEF_DIRTY_BACKGROUND_RATIO" > /proc/sys/vm/dirty_background_ratio - ;; - esac - if [ $DO_REMOUNTS -eq 1 ] ; then - cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do - # Reset commit and atime options to defaults. - if [ "$FST" = 'unknown' ]; then - FST=$(deduce_fstype $MP) - fi - case "$FST" in - "ext3") - PARSEDOPTS="$(parse_mount_opts_wfstab $DEV commit $OPTS)" - PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $PARSEDOPTS)" - mount $DEV -t $FST $MP -o remount,$PARSEDOPTS - ;; - "xfs") - PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $OPTS)" - mount $DEV -t $FST $MP -o remount,$PARSEDOPTS - ;; - esac - if [ -b $DEV ] ; then - blockdev --setra 256 $DEV - fi - done - fi - if [ $DO_HD -eq 1 ] ; then - for THISHD in $HD ; do - /sbin/hdparm -S $AC_HD $THISHD > /dev/null 2>&1 - /sbin/hdparm -B 255 $THISHD > /dev/null 2>&1 - done - fi - if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then - echo `cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq` > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq - fi - echo "." - ;; - *) - echo "Usage: $0 {start|stop}" 2>&1 - exit 1 - ;; - - esac - - exit 0 - - -ACPI integration ----------------- - -Dax Kelson submitted this so that the ACPI acpid daemon will -kick off the laptop_mode script and run hdparm. The part that -automatically disables laptop mode when the battery is low was -written by Jan Topinski. - -/etc/acpi/events/ac_adapter:: - - event=ac_adapter - action=/etc/acpi/actions/ac.sh %e - -/etc/acpi/events/battery:: - - event=battery.* - action=/etc/acpi/actions/battery.sh %e - -/etc/acpi/actions/ac.sh:: - - #!/bin/bash - - # ac on/offline event handler - - status=`awk '/^state: / { print $2 }' /proc/acpi/ac_adapter/$2/state` - - case $status in - "on-line") - /sbin/laptop_mode stop - exit 0 - ;; - "off-line") - /sbin/laptop_mode start - exit 0 - ;; - esac - - -/etc/acpi/actions/battery.sh:: - - #! /bin/bash - - # Automatically disable laptop mode when the battery almost runs out. - - BATT_INFO=/proc/acpi/battery/$2/state - - if [[ -f /proc/sys/vm/laptop_mode ]] - then - LM=`cat /proc/sys/vm/laptop_mode` - if [[ $LM -gt 0 ]] - then - if [[ -f $BATT_INFO ]] - then - # Source the config file only now that we know we need - if [ -f /etc/default/laptop-mode ] ; then - # Debian - . /etc/default/laptop-mode - elif [ -f /etc/sysconfig/laptop-mode ] ; then - # Others - . /etc/sysconfig/laptop-mode - fi - MINIMUM_BATTERY_MINUTES=${MINIMUM_BATTERY_MINUTES:-'10'} - - ACTION="`cat $BATT_INFO | grep charging | cut -c 26-`" - if [[ ACTION -eq "discharging" ]] - then - PRESENT_RATE=`cat $BATT_INFO | grep "present rate:" | sed "s/.* \([0-9][0-9]* \).*/\1/" ` - REMAINING=`cat $BATT_INFO | grep "remaining capacity:" | sed "s/.* \([0-9][0-9]* \).*/\1/" ` - fi - if (($REMAINING * 60 / $PRESENT_RATE < $MINIMUM_BATTERY_MINUTES)) - then - /sbin/laptop_mode stop - fi - else - logger -p daemon.warning "You are using laptop mode and your battery interface $BATT_INFO is missing. This may lead to loss of data when the battery runs out. Check kernel ACPI support and /proc/acpi/battery folder, and edit /etc/acpi/battery.sh to set BATT_INFO to the correct path." - fi - fi - fi - - -Monitoring tool ---------------- - -Bartek Kania submitted this, it can be used to measure how much time your disk -spends spun up/down. See tools/laptop/dslm/dslm.c diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 245bf6394935..ca6ebeb5171c 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -41,7 +41,6 @@ Currently, these files are in /proc/sys/vm: - extfrag_threshold - highmem_is_dirtyable - hugetlb_shm_group -- laptop_mode - legacy_va_layout - lowmem_reserve_ratio - max_map_count @@ -363,13 +362,6 @@ hugetlb_shm_group contains group id that is allowed to create SysV shared memory segment using hugetlb page. -laptop_mode -=========== - -laptop_mode is a knob that controls "laptop mode". All the things that are -controlled by this knob are discussed in Documentation/admin-guide/laptops/laptop-mode.rst. - - legacy_va_layout ================ diff --git a/block/blk-mq.c b/block/blk-mq.c index a29d8ac9d3e3..4bae7c4c664e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -811,9 +811,6 @@ void blk_mq_free_request(struct request *rq) blk_mq_finish_request(rq); - if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) - laptop_io_completion(q->disk->bdi); - rq_qos_done(q, rq); WRITE_ONCE(rq->state, MQ_RQ_IDLE); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0c466ccbed69..15eb463d5a9b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3305,8 +3305,7 @@ int ext4_alloc_da_blocks(struct inode *inode) /* * We do something simple for now. The filemap_flush() will * also start triggering a write of the data blocks, which is - * not strictly speaking necessary (and for users of - * laptop_mode, not even desirable). However, to do otherwise + * not strictly speaking necessary. However, to do otherwise * would require replicating code paths in: * * ext4_writepages() -> diff --git a/fs/sync.c b/fs/sync.c index 431fc5f5be06..6330150792f6 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -104,8 +104,6 @@ void ksys_sync(void) iterate_supers(sync_fs_one_sb, &wait); sync_bdevs(false); sync_bdevs(true); - if (unlikely(laptop_mode)) - laptop_sync_completion(); } SYSCALL_DEFINE0(sync) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bc71aa9dcee8..a2014fb1bc66 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -845,15 +845,6 @@ xfs_fs_sync_fs( if (error) return error; - if (laptop_mode) { - /* - * The disk must be active because we're syncing. - * We schedule log work now (now that the disk is - * active) instead of later (when it might not be). - */ - flush_delayed_work(&mp->m_log->l_work); - } - /* * If we are called with page faults frozen out, it means we are about * to freeze the transaction subsystem. Take the opportunity to shut diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 0217c1073735..c88fd4d37d1f 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -46,7 +46,6 @@ enum wb_reason { WB_REASON_VMSCAN, WB_REASON_SYNC, WB_REASON_PERIODIC, - WB_REASON_LAPTOP_TIMER, WB_REASON_FS_FREE_SPACE, /* * There is no bdi forker thread any more and works are done @@ -204,8 +203,6 @@ struct backing_dev_info { char dev_name[64]; struct device *owner; - struct timer_list laptop_mode_wb_timer; - #ifdef CONFIG_DEBUG_FS struct dentry *debug_dir; #endif diff --git a/include/linux/writeback.h b/include/linux/writeback.h index f48e8ccffe81..e530112c4b3a 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -328,9 +328,6 @@ struct dirty_throttle_control { bool dirty_exceeded; }; -void laptop_io_completion(struct backing_dev_info *info); -void laptop_sync_completion(void); -void laptop_mode_timer_fn(struct timer_list *t); bool node_dirty_ok(struct pglist_data *pgdat); int wb_domain_init(struct wb_domain *dom, gfp_t gfp); #ifdef CONFIG_CGROUP_WRITEBACK @@ -342,7 +339,6 @@ extern struct wb_domain global_wb_domain; /* These are exported to sysctl. */ extern unsigned int dirty_writeback_interval; extern unsigned int dirty_expire_interval; -extern int laptop_mode; void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 311a341e6fe4..b6f94e97788a 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -42,7 +42,6 @@ EM( WB_REASON_VMSCAN, "vmscan") \ EM( WB_REASON_SYNC, "sync") \ EM( WB_REASON_PERIODIC, "periodic") \ - EM( WB_REASON_LAPTOP_TIMER, "laptop_timer") \ EM( WB_REASON_FS_FREE_SPACE, "fs_free_space") \ EM( WB_REASON_FORKER_THREAD, "forker_thread") \ EMe(WB_REASON_FOREIGN_FLUSH, "foreign_flush") diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 63d1464cb71c..6ea9ea8413fa 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -183,7 +183,7 @@ enum VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */ VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */ - VM_LAPTOP_MODE=23, /* vm laptop mode */ + VM_BLOCK_DUMP=24, /* block dump mode */ VM_HUGETLB_GROUP=25, /* permitted hugetlb group */ VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c5740c6d37a2..a0e26d1b717f 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1034,7 +1034,6 @@ struct backing_dev_info *bdi_alloc(int node_id) bdi->capabilities = BDI_CAP_WRITEBACK; bdi->ra_pages = VM_READAHEAD_PAGES; bdi->io_pages = VM_READAHEAD_PAGES; - timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); return bdi; } EXPORT_SYMBOL(bdi_alloc); @@ -1156,8 +1155,6 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { - timer_delete_sync(&bdi->laptop_mode_wb_timer); - /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ccdeb0e84d39..601a5e048d12 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -109,14 +109,6 @@ EXPORT_SYMBOL_GPL(dirty_writeback_interval); */ unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ -/* - * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: - * a full sync is triggered after this time elapses without any disk activity. - */ -int laptop_mode; - -EXPORT_SYMBOL(laptop_mode); - /* End of sysctl-exported parameters */ struct wb_domain global_wb_domain; @@ -1843,17 +1835,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb, balance_domain_limits(mdtc, strictlimit); } - /* - * In laptop mode, we wait until hitting the higher threshold - * before starting background writeout, and then write out all - * the way down to the lower threshold. So slow writers cause - * minimal disk activity. - * - * In normal mode, we start background writeout at the lower - * background_thresh, to keep the amount of dirty memory low. - */ - if (!laptop_mode && nr_dirty > gdtc->bg_thresh && - !writeback_in_progress(wb)) + if (nr_dirty > gdtc->bg_thresh && !writeback_in_progress(wb)) wb_start_background_writeback(wb); /* @@ -1876,10 +1858,6 @@ free_running: break; } - /* Start writeback even when in laptop mode */ - if (unlikely(!writeback_in_progress(wb))) - wb_start_background_writeback(wb); - mem_cgroup_flush_foreign(wb); /* @@ -2198,41 +2176,6 @@ static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int } #endif -void laptop_mode_timer_fn(struct timer_list *t) -{ - struct backing_dev_info *backing_dev_info = - timer_container_of(backing_dev_info, t, laptop_mode_wb_timer); - - wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER); -} - -/* - * We've spun up the disk and we're in laptop mode: schedule writeback - * of all dirty data a few seconds from now. If the flush is already scheduled - * then push it back - the user is still using the disk. - */ -void laptop_io_completion(struct backing_dev_info *info) -{ - mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode); -} - -/* - * We're in laptop mode and we've just synced. The sync's writes will have - * caused another writeback to be scheduled by laptop_io_completion. - * Nothing needs to be written back anymore, so we unschedule the writeback. - */ -void laptop_sync_completion(void) -{ - struct backing_dev_info *bdi; - - rcu_read_lock(); - - list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) - timer_delete(&bdi->laptop_mode_wb_timer); - - rcu_read_unlock(); -} - /* * If ratelimit_pages is too high then we can get into dirty-data overload * if a large number of processes all perform writes at the same time. @@ -2263,6 +2206,19 @@ static int page_writeback_cpu_online(unsigned int cpu) #ifdef CONFIG_SYSCTL +static int laptop_mode; +static int laptop_mode_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_jiffies(table, write, buffer, lenp, ppos); + + if (!ret && write) + pr_warn("%s: vm.laptop_mode is deprecated. Ignoring setting.\n", + current->comm); + + return ret; +} + /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -2332,7 +2288,7 @@ static const struct ctl_table vm_page_writeback_sysctls[] = { .data = &laptop_mode, .maxlen = sizeof(laptop_mode), .mode = 0644, - .proc_handler = proc_dointvec_jiffies, + .proc_handler = laptop_mode_handler, }, }; #endif diff --git a/mm/vmscan.c b/mm/vmscan.c index 1c87945fa761..fc5691afb998 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -104,13 +104,13 @@ struct scan_control { unsigned int force_deactivate:1; unsigned int skipped_deactivate:1; - /* Writepage batching in laptop mode; RECLAIM_WRITE */ + /* zone_reclaim_mode, boost reclaim */ unsigned int may_writepage:1; - /* Can mapped folios be reclaimed? */ + /* zone_reclaim_mode */ unsigned int may_unmap:1; - /* Can folios be swapped as part of reclaim? */ + /* zome_reclaim_mode, boost reclaim, cgroup restrictions */ unsigned int may_swap:1; /* Not allow cache_trim_mode to be turned on as part of reclaim? */ @@ -6365,13 +6365,6 @@ retry: if (sc->compaction_ready) break; - - /* - * If we're getting trouble reclaiming, start doing - * writepage even in laptop mode. - */ - if (sc->priority < DEF_PRIORITY - 2) - sc->may_writepage = 1; } while (--sc->priority >= 0); last_pgdat = NULL; @@ -6580,7 +6573,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .order = order, .nodemask = nodemask, .priority = DEF_PRIORITY, - .may_writepage = !laptop_mode, + .may_writepage = 1, .may_unmap = 1, .may_swap = 1, }; @@ -6624,7 +6617,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, .target_mem_cgroup = memcg, - .may_writepage = !laptop_mode, + .may_writepage = 1, .may_unmap = 1, .reclaim_idx = MAX_NR_ZONES - 1, .may_swap = !noswap, @@ -6670,7 +6663,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .reclaim_idx = MAX_NR_ZONES - 1, .target_mem_cgroup = memcg, .priority = DEF_PRIORITY, - .may_writepage = !laptop_mode, + .may_writepage = 1, .may_unmap = 1, .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), @@ -7051,7 +7044,7 @@ restart: * from reclaim context. If no pages are reclaimed, the * reclaim will be aborted. */ - sc.may_writepage = !laptop_mode && !nr_boost_reclaim; + sc.may_writepage = !nr_boost_reclaim; sc.may_swap = !nr_boost_reclaim; /* @@ -7061,13 +7054,6 @@ restart: */ kswapd_age_node(pgdat, &sc); - /* - * If we're getting trouble reclaiming, start doing writepage - * even in laptop mode. - */ - if (sc.priority < DEF_PRIORITY - 2) - sc.may_writepage = 1; - /* Call soft limit reclaim before calling shrink_node. */ sc.nr_scanned = 0; nr_soft_scanned = 0; @@ -7799,7 +7785,7 @@ int user_proactive_reclaim(char *buf, .reclaim_idx = gfp_zone(gfp_mask), .proactive_swappiness = swappiness == -1 ? NULL : &swappiness, .priority = DEF_PRIORITY, - .may_writepage = !laptop_mode, + .may_writepage = 1, .nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX), .may_unmap = 1, .may_swap = 1, -- cgit v1.2.3