From 804c26b961da295bd70c86a3c9dc4bea0b09de88 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Tue, 16 Dec 2025 00:01:25 -0800
Subject: mm/damon/core: add trace point for damos stat per apply interval

DAMON users can read DAMOS stats via DAMON sysfs interface.  It enables
efficient, simple and flexible usages of the stats.  Especially for
systems not having advanced tools like perf or bpftrace, that can be
useful.  But if the advanced tools are available, exposing the stats via
tracepoint can reduce unnecessary reimplementation of the wheels.  Add a
new tracepoint for DAMOS stats, namely damos_stat_after_apply_interval.
The tracepoint is triggered for each scheme's apply interval and exposes
the whole stat values.  If the user needs sub-apply interval information
for any chance, damos_before_apply tracepoint could be used.

Link: https://lkml.kernel.org/r/20251216080128.42991-13-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/trace/events/damon.h | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

(limited to 'include/trace')

diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h
index 852d725afea2..24fc402ab3c8 100644
--- a/include/trace/events/damon.h
+++ b/include/trace/events/damon.h
@@ -9,6 +9,47 @@
 #include <linux/types.h>
 #include <linux/tracepoint.h>
 
+TRACE_EVENT(damos_stat_after_apply_interval,
+
+	TP_PROTO(unsigned int context_idx, unsigned int scheme_idx,
+		struct damos_stat *stat),
+
+	TP_ARGS(context_idx, scheme_idx, stat),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, context_idx)
+		__field(unsigned int, scheme_idx)
+		__field(unsigned long, nr_tried)
+		__field(unsigned long, sz_tried)
+		__field(unsigned long, nr_applied)
+		__field(unsigned long, sz_applied)
+		__field(unsigned long, sz_ops_filter_passed)
+		__field(unsigned long, qt_exceeds)
+		__field(unsigned long, nr_snapshots)
+	),
+
+	TP_fast_assign(
+		__entry->context_idx = context_idx;
+		__entry->scheme_idx = scheme_idx;
+		__entry->nr_tried = stat->nr_tried;
+		__entry->sz_tried = stat->sz_tried;
+		__entry->nr_applied = stat->nr_applied;
+		__entry->sz_applied = stat->sz_applied;
+		__entry->sz_ops_filter_passed = stat->sz_ops_filter_passed;
+		__entry->qt_exceeds = stat->qt_exceeds;
+		__entry->nr_snapshots = stat->nr_snapshots;
+	),
+
+	TP_printk("ctx_idx=%u scheme_idx=%u nr_tried=%lu sz_tried=%lu "
+			"nr_applied=%lu sz_tried=%lu sz_ops_filter_passed=%lu "
+			"qt_exceeds=%lu nr_snapshots=%lu",
+			__entry->context_idx, __entry->scheme_idx,
+			__entry->nr_tried, __entry->sz_tried,
+			__entry->nr_applied, __entry->sz_applied,
+			__entry->sz_ops_filter_passed, __entry->qt_exceeds,
+			__entry->nr_snapshots)
+);
+
 TRACE_EVENT(damos_esz,
 
 	TP_PROTO(unsigned int context_idx, unsigned int scheme_idx,
-- 
cgit v1.2.3


From 64dd89ae01f2708a508e028c28b7906e4702a9a7 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 15 Dec 2025 12:57:53 -0500
Subject: mm/block/fs: remove laptop_mode

Laptop mode was introduced to save battery, by delaying and consolidating
writes and thereby maximize the time rotating hard drives wouldn't have to
spin.

Luckily, rotating hard drives, with their high spin-up times and power
draw, are a thing of the past for battery-powered devices.  Reclaim has
also since changed to not write single filesystem pages anymore, and
regular filesystem writeback is lumpy by design.

The juice doesn't appear worth the squeeze anymore.  The footprint of the
feature is small, but nevertheless it's a complicating factor in mm,
block, filesystems.  Developers don't think about it, and it likely hasn't
been tested with new reclaim and writeback changes in years.

Let's sunset it.  Keep the sysctl with a deprecation warning around for a
few more cycles, but remove all functionality behind it.

[akpm@linux-foundation.org: fix Documentation/admin-guide/laptops/index.rst]
Link: https://lkml.kernel.org/r/20251216185201.GH905277@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: Christoph Hellwig <hch@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Deepanshu Kartikey <kartikey406@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/laptops/index.rst       |   1 -
 Documentation/admin-guide/laptops/laptop-mode.rst | 770 ----------------------
 Documentation/admin-guide/sysctl/vm.rst           |   8 -
 block/blk-mq.c                                    |   3 -
 fs/ext4/inode.c                                   |   3 +-
 fs/sync.c                                         |   2 -
 fs/xfs/xfs_super.c                                |   9 -
 include/linux/backing-dev-defs.h                  |   3 -
 include/linux/writeback.h                         |   4 -
 include/trace/events/writeback.h                  |   1 -
 include/uapi/linux/sysctl.h                       |   2 +-
 mm/backing-dev.c                                  |   3 -
 mm/page-writeback.c                               |  74 +--
 mm/vmscan.c                                       |  30 +-
 14 files changed, 25 insertions(+), 888 deletions(-)
 delete mode 100644 Documentation/admin-guide/laptops/laptop-mode.rst

(limited to 'include/trace')

diff --git a/Documentation/admin-guide/laptops/index.rst b/Documentation/admin-guide/laptops/index.rst
index 6432c251dc95..c0b911d05c59 100644
--- a/Documentation/admin-guide/laptops/index.rst
+++ b/Documentation/admin-guide/laptops/index.rst
@@ -10,7 +10,6 @@ Laptop Drivers
    alienware-wmi
    asus-laptop
    disk-shock-protection
-   laptop-mode
    lg-laptop
    samsung-galaxybook
    sony-laptop
diff --git a/Documentation/admin-guide/laptops/laptop-mode.rst b/Documentation/admin-guide/laptops/laptop-mode.rst
deleted file mode 100644
index 66eb9cd918b5..000000000000
--- a/Documentation/admin-guide/laptops/laptop-mode.rst
+++ /dev/null
@@ -1,770 +0,0 @@
-===============================================
-How to conserve battery power using laptop-mode
-===============================================
-
-Document Author: Bart Samwel (bart@samwel.tk)
-
-Date created: January 2, 2004
-
-Last modified: December 06, 2004
-
-Introduction
-------------
-
-Laptop mode is used to minimize the time that the hard disk needs to be spun up,
-to conserve battery power on laptops. It has been reported to cause significant
-power savings.
-
-.. Contents
-
-   * Introduction
-   * Installation
-   * Caveats
-   * The Details
-   * Tips & Tricks
-   * Control script
-   * ACPI integration
-   * Monitoring tool
-
-
-Installation
-------------
-
-To use laptop mode, you don't need to set any kernel configuration options
-or anything. Simply install all the files included in this document, and
-laptop mode will automatically be started when you're on battery. For
-your convenience, a tarball containing an installer can be downloaded at:
-
-	http://www.samwel.tk/laptop_mode/laptop_mode/
-
-To configure laptop mode, you need to edit the configuration file, which is
-located in /etc/default/laptop-mode on Debian-based systems, or in
-/etc/sysconfig/laptop-mode on other systems.
-
-Unfortunately, automatic enabling of laptop mode does not work for
-laptops that don't have ACPI. On those laptops, you need to start laptop
-mode manually. To start laptop mode, run "laptop_mode start", and to
-stop it, run "laptop_mode stop". (Note: The laptop mode tools package now
-has experimental support for APM, you might want to try that first.)
-
-
-Caveats
--------
-
-* The downside of laptop mode is that you have a chance of losing up to 10
-  minutes of work. If you cannot afford this, don't use it! The supplied ACPI
-  scripts automatically turn off laptop mode when the battery almost runs out,
-  so that you won't lose any data at the end of your battery life.
-
-* Most desktop hard drives have a very limited lifetime measured in spindown
-  cycles, typically about 50.000 times (it's usually listed on the spec sheet).
-  Check your drive's rating, and don't wear down your drive's lifetime if you
-  don't need to.
-
-* If you mount some of your ext3 filesystems with the -n option, then
-  the control script will not be able to remount them correctly. You must set
-  DO_REMOUNTS=0 in the control script, otherwise it will remount them with the
-  wrong options -- or it will fail because it cannot write to /etc/mtab.
-
-* If you have your filesystems listed as type "auto" in fstab, like I did, then
-  the control script will not recognize them as filesystems that need remounting.
-  You must list the filesystems with their true type instead.
-
-* It has been reported that some versions of the mutt mail client use file access
-  times to determine whether a folder contains new mail. If you use mutt and
-  experience this, you must disable the noatime remounting by setting the option
-  DO_REMOUNT_NOATIME to 0 in the configuration file.
-
-
-The Details
------------
-
-Laptop mode is controlled by the knob /proc/sys/vm/laptop_mode. This knob is
-present for all kernels that have the laptop mode patch, regardless of any
-configuration options. When the knob is set, any physical disk I/O (that might
-have caused the hard disk to spin up) causes Linux to flush all dirty blocks. The
-result of this is that after a disk has spun down, it will not be spun up
-anymore to write dirty blocks, because those blocks had already been written
-immediately after the most recent read operation. The value of the laptop_mode
-knob determines the time between the occurrence of disk I/O and when the flush
-is triggered. A sensible value for the knob is 5 seconds. Setting the knob to
-0 disables laptop mode.
-
-To increase the effectiveness of the laptop_mode strategy, the laptop_mode
-control script increases dirty_expire_centisecs and dirty_writeback_centisecs in
-/proc/sys/vm to about 10 minutes (by default), which means that pages that are
-dirtied are not forced to be written to disk as often. The control script also
-changes the dirty background ratio, so that background writeback of dirty pages
-is not done anymore. Combined with a higher commit value (also 10 minutes) for
-ext3 filesystem (also done automatically by the control script),
-this results in concentration of disk activity in a small time interval which
-occurs only once every 10 minutes, or whenever the disk is forced to spin up by
-a cache miss. The disk can then be spun down in the periods of inactivity.
-
-
-Configuration
--------------
-
-The laptop mode configuration file is located in /etc/default/laptop-mode on
-Debian-based systems, or in /etc/sysconfig/laptop-mode on other systems. It
-contains the following options:
-
-MAX_AGE:
-
-Maximum time, in seconds, of hard drive spindown time that you are
-comfortable with. Worst case, it's possible that you could lose this
-amount of work if your battery fails while you're in laptop mode.
-
-MINIMUM_BATTERY_MINUTES:
-
-Automatically disable laptop mode if the remaining number of minutes of
-battery power is less than this value. Default is 10 minutes.
-
-AC_HD/BATT_HD:
-
-The idle timeout that should be set on your hard drive when laptop mode
-is active (BATT_HD) and when it is not active (AC_HD). The defaults are
-20 seconds (value 4) for BATT_HD  and 2 hours (value 244) for AC_HD. The
-possible values are those listed in the manual page for "hdparm" for the
-"-S" option.
-
-HD:
-
-The devices for which the spindown timeout should be adjusted by laptop mode.
-Default is /dev/hda. If you specify multiple devices, separate them by a space.
-
-READAHEAD:
-
-Disk readahead, in 512-byte sectors, while laptop mode is active. A large
-readahead can prevent disk accesses for things like executable pages (which are
-loaded on demand while the application executes) and sequentially accessed data
-(MP3s).
-
-DO_REMOUNTS:
-
-The control script automatically remounts any mounted journaled filesystems
-with appropriate commit interval options. When this option is set to 0, this
-feature is disabled.
-
-DO_REMOUNT_NOATIME:
-
-When remounting, should the filesystems be remounted with the noatime option?
-Normally, this is set to "1" (enabled), but there may be programs that require
-access time recording.
-
-DIRTY_RATIO:
-
-The percentage of memory that is allowed to contain "dirty" or unsaved data
-before a writeback is forced, while laptop mode is active. Corresponds to
-the /proc/sys/vm/dirty_ratio sysctl.
-
-DIRTY_BACKGROUND_RATIO:
-
-The percentage of memory that is allowed to contain "dirty" or unsaved data
-after a forced writeback is done due to an exceeding of DIRTY_RATIO. Set
-this nice and low. This corresponds to the /proc/sys/vm/dirty_background_ratio
-sysctl.
-
-Note that the behaviour of dirty_background_ratio is quite different
-when laptop mode is active and when it isn't. When laptop mode is inactive,
-dirty_background_ratio is the threshold percentage at which background writeouts
-start taking place. When laptop mode is active, however, background writeouts
-are disabled, and the dirty_background_ratio only determines how much writeback
-is done when dirty_ratio is reached.
-
-DO_CPU:
-
-Enable CPU frequency scaling when in laptop mode. (Requires CPUFreq to be setup.
-See Documentation/admin-guide/pm/cpufreq.rst for more info. Disabled by default.)
-
-CPU_MAXFREQ:
-
-When on battery, what is the maximum CPU speed that the system should use? Legal
-values are "slowest" for the slowest speed that your CPU is able to operate at,
-or a value listed in /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies.
-
-
-Tips & Tricks
--------------
-
-* Bartek Kania reports getting up to 50 minutes of extra battery life (on top
-  of his regular 3 to 3.5 hours) using a spindown time of 5 seconds (BATT_HD=1).
-
-* You can spin down the disk while playing MP3, by setting disk readahead
-  to 8MB (READAHEAD=16384). Effectively, the disk will read a complete MP3 at
-  once, and will then spin down while the MP3 is playing. (Thanks to Bartek
-  Kania.)
-
-* Drew Scott Daniels observed: "I don't know why, but when I decrease the number
-  of colours that my display uses it consumes less battery power. I've seen
-  this on powerbooks too. I hope that this is a piece of information that
-  might be useful to the Laptop Mode patch or its users."
-
-* In syslog.conf, you can prefix entries with a dash `-` to omit syncing the
-  file after every logging. When you're using laptop-mode and your disk doesn't
-  spin down, this is a likely culprit.
-
-* Richard Atterer observed that laptop mode does not work well with noflushd
-  (http://noflushd.sourceforge.net/), it seems that noflushd prevents laptop-mode
-  from doing its thing.
-
-* If you're worried about your data, you might want to consider using a USB
-  memory stick or something like that as a "working area". (Be aware though
-  that flash memory can only handle a limited number of writes, and overuse
-  may wear out your memory stick pretty quickly. Do _not_ use journalling
-  filesystems on flash memory sticks.)
-
-
-Configuration file for control and ACPI battery scripts
--------------------------------------------------------
-
-This allows the tunables to be changed for the scripts via an external
-configuration file
-
-It should be installed as /etc/default/laptop-mode on Debian, and as
-/etc/sysconfig/laptop-mode on Red Hat, SUSE, Mandrake, and other work-alikes.
-
-Config file::
-
-  # Maximum time, in seconds, of hard drive spindown time that you are
-  # comfortable with. Worst case, it's possible that you could lose this
-  # amount of work if your battery fails you while in laptop mode.
-  #MAX_AGE=600
-
-  # Automatically disable laptop mode when the number of minutes of battery
-  # that you have left goes below this threshold.
-  MINIMUM_BATTERY_MINUTES=10
-
-  # Read-ahead, in 512-byte sectors. You can spin down the disk while playing MP3/OGG
-  # by setting the disk readahead to 8MB (READAHEAD=16384). Effectively, the disk
-  # will read a complete MP3 at once, and will then spin down while the MP3/OGG is
-  # playing.
-  #READAHEAD=4096
-
-  # Shall we remount journaled fs. with appropriate commit interval? (1=yes)
-  #DO_REMOUNTS=1
-
-  # And shall we add the "noatime" option to that as well? (1=yes)
-  #DO_REMOUNT_NOATIME=1
-
-  # Dirty synchronous ratio.  At this percentage of dirty pages the process
-  # which
-  # calls write() does its own writeback
-  #DIRTY_RATIO=40
-
-  #
-  # Allowed dirty background ratio, in percent.  Once DIRTY_RATIO has been
-  # exceeded, the kernel will wake flusher threads which will then reduce the
-  # amount of dirty memory to dirty_background_ratio.  Set this nice and low,
-  # so once some writeout has commenced, we do a lot of it.
-  #
-  #DIRTY_BACKGROUND_RATIO=5
-
-  # kernel default dirty buffer age
-  #DEF_AGE=30
-  #DEF_UPDATE=5
-  #DEF_DIRTY_BACKGROUND_RATIO=10
-  #DEF_DIRTY_RATIO=40
-  #DEF_XFS_AGE_BUFFER=15
-  #DEF_XFS_SYNC_INTERVAL=30
-  #DEF_XFS_BUFD_INTERVAL=1
-
-  # This must be adjusted manually to the value of HZ in the running kernel
-  # on 2.4, until the XFS people change their 2.4 external interfaces to work in
-  # centisecs. This can be automated, but it's a work in progress that still
-  # needs# some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for
-  # external interfaces, and that is currently always set to 100. So you don't
-  # need to change this on 2.6.
-  #XFS_HZ=100
-
-  # Should the maximum CPU frequency be adjusted down while on battery?
-  # Requires CPUFreq to be setup.
-  # See Documentation/admin-guide/pm/cpufreq.rst for more info
-  #DO_CPU=0
-
-  # When on battery what is the maximum CPU speed that the system should
-  # use? Legal values are "slowest" for the slowest speed that your
-  # CPU is able to operate at, or a value listed in:
-  # /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies
-  # Only applicable if DO_CPU=1.
-  #CPU_MAXFREQ=slowest
-
-  # Idle timeout for your hard drive (man hdparm for valid values, -S option)
-  # Default is 2 hours on AC (AC_HD=244) and 20 seconds for battery (BATT_HD=4).
-  #AC_HD=244
-  #BATT_HD=4
-
-  # The drives for which to adjust the idle timeout. Separate them by a space,
-  # e.g. HD="/dev/hda /dev/hdb".
-  #HD="/dev/hda"
-
-  # Set the spindown timeout on a hard drive?
-  #DO_HD=1
-
-
-Control script
---------------
-
-Please note that this control script works for the Linux 2.4 and 2.6 series (thanks
-to Kiko Piris).
-
-Control script::
-
-  #!/bin/bash
-
-  # start or stop laptop_mode, best run by a power management daemon when
-  # ac gets connected/disconnected from a laptop
-  #
-  # install as /sbin/laptop_mode
-  #
-  # Contributors to this script:   Kiko Piris
-  #				 Bart Samwel
-  #				 Micha Feigin
-  #				 Andrew Morton
-  #				 Herve Eychenne
-  #				 Dax Kelson
-  #
-  # Original Linux 2.4 version by: Jens Axboe
-
-  #############################################################################
-
-  # Source config
-  if [ -f /etc/default/laptop-mode ] ; then
-	# Debian
-	. /etc/default/laptop-mode
-  elif [ -f /etc/sysconfig/laptop-mode ] ; then
-	# Others
-          . /etc/sysconfig/laptop-mode
-  fi
-
-  # Don't raise an error if the config file is incomplete
-  # set defaults instead:
-
-  # Maximum time, in seconds, of hard drive spindown time that you are
-  # comfortable with. Worst case, it's possible that you could lose this
-  # amount of work if your battery fails you while in laptop mode.
-  MAX_AGE=${MAX_AGE:-'600'}
-
-  # Read-ahead, in kilobytes
-  READAHEAD=${READAHEAD:-'4096'}
-
-  # Shall we remount journaled fs. with appropriate commit interval? (1=yes)
-  DO_REMOUNTS=${DO_REMOUNTS:-'1'}
-
-  # And shall we add the "noatime" option to that as well? (1=yes)
-  DO_REMOUNT_NOATIME=${DO_REMOUNT_NOATIME:-'1'}
-
-  # Shall we adjust the idle timeout on a hard drive?
-  DO_HD=${DO_HD:-'1'}
-
-  # Adjust idle timeout on which hard drive?
-  HD="${HD:-'/dev/hda'}"
-
-  # spindown time for HD (hdparm -S values)
-  AC_HD=${AC_HD:-'244'}
-  BATT_HD=${BATT_HD:-'4'}
-
-  # Dirty synchronous ratio.  At this percentage of dirty pages the process which
-  # calls write() does its own writeback
-  DIRTY_RATIO=${DIRTY_RATIO:-'40'}
-
-  # cpu frequency scaling
-  # See Documentation/admin-guide/pm/cpufreq.rst for more info
-  DO_CPU=${CPU_MANAGE:-'0'}
-  CPU_MAXFREQ=${CPU_MAXFREQ:-'slowest'}
-
-  #
-  # Allowed dirty background ratio, in percent.  Once DIRTY_RATIO has been
-  # exceeded, the kernel will wake flusher threads which will then reduce the
-  # amount of dirty memory to dirty_background_ratio.  Set this nice and low,
-  # so once some writeout has commenced, we do a lot of it.
-  #
-  DIRTY_BACKGROUND_RATIO=${DIRTY_BACKGROUND_RATIO:-'5'}
-
-  # kernel default dirty buffer age
-  DEF_AGE=${DEF_AGE:-'30'}
-  DEF_UPDATE=${DEF_UPDATE:-'5'}
-  DEF_DIRTY_BACKGROUND_RATIO=${DEF_DIRTY_BACKGROUND_RATIO:-'10'}
-  DEF_DIRTY_RATIO=${DEF_DIRTY_RATIO:-'40'}
-  DEF_XFS_AGE_BUFFER=${DEF_XFS_AGE_BUFFER:-'15'}
-  DEF_XFS_SYNC_INTERVAL=${DEF_XFS_SYNC_INTERVAL:-'30'}
-  DEF_XFS_BUFD_INTERVAL=${DEF_XFS_BUFD_INTERVAL:-'1'}
-
-  # This must be adjusted manually to the value of HZ in the running kernel
-  # on 2.4, until the XFS people change their 2.4 external interfaces to work in
-  # centisecs. This can be automated, but it's a work in progress that still needs
-  # some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for external
-  # interfaces, and that is currently always set to 100. So you don't need to
-  # change this on 2.6.
-  XFS_HZ=${XFS_HZ:-'100'}
-
-  #############################################################################
-
-  KLEVEL="$(uname -r |
-               {
-	       IFS='.' read a b c
-	       echo $a.$b
-	     }
-  )"
-  case "$KLEVEL" in
-	"2.4"|"2.6")
-		;;
-	*)
-		echo "Unhandled kernel version: $KLEVEL ('uname -r' = '$(uname -r)')" >&2
-		exit 1
-		;;
-  esac
-
-  if [ ! -e /proc/sys/vm/laptop_mode ] ; then
-	echo "Kernel is not patched with laptop_mode patch." >&2
-	exit 1
-  fi
-
-  if [ ! -w /proc/sys/vm/laptop_mode ] ; then
-	echo "You do not have enough privileges to enable laptop_mode." >&2
-	exit 1
-  fi
-
-  # Remove an option (the first parameter) of the form option=<number> from
-  # a mount options string (the rest of the parameters).
-  parse_mount_opts () {
-	OPT="$1"
-	shift
-	echo ",$*," | sed		\
-	 -e 's/,'"$OPT"'=[0-9]*,/,/g'	\
-	 -e 's/,,*/,/g'			\
-	 -e 's/^,//'			\
-	 -e 's/,$//'
-  }
-
-  # Remove an option (the first parameter) without any arguments from
-  # a mount option string (the rest of the parameters).
-  parse_nonumber_mount_opts () {
-	OPT="$1"
-	shift
-	echo ",$*," | sed		\
-	 -e 's/,'"$OPT"',/,/g'		\
-	 -e 's/,,*/,/g'			\
-	 -e 's/^,//'			\
-	 -e 's/,$//'
-  }
-
-  # Find out the state of a yes/no option (e.g. "atime"/"noatime") in
-  # fstab for a given filesystem, and use this state to replace the
-  # value of the option in another mount options string. The device
-  # is the first argument, the option name the second, and the default
-  # value the third. The remainder is the mount options string.
-  #
-  # Example:
-  # parse_yesno_opts_wfstab /dev/hda1 atime atime defaults,noatime
-  #
-  # If fstab contains, say, "rw" for this filesystem, then the result
-  # will be "defaults,atime".
-  parse_yesno_opts_wfstab () {
-	L_DEV="$1"
-	OPT="$2"
-	DEF_OPT="$3"
-	shift 3
-	L_OPTS="$*"
-	PARSEDOPTS1="$(parse_nonumber_mount_opts $OPT $L_OPTS)"
-	PARSEDOPTS1="$(parse_nonumber_mount_opts no$OPT $PARSEDOPTS1)"
-	# Watch for a default atime in fstab
-	FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)"
-	if echo "$FSTAB_OPTS" | grep "$OPT" > /dev/null ; then
-		# option specified in fstab: extract the value and use it
-		if echo "$FSTAB_OPTS" | grep "no$OPT" > /dev/null ; then
-			echo "$PARSEDOPTS1,no$OPT"
-		else
-			# no$OPT not found -- so we must have $OPT.
-			echo "$PARSEDOPTS1,$OPT"
-		fi
-	else
-		# option not specified in fstab -- choose the default.
-		echo "$PARSEDOPTS1,$DEF_OPT"
-	fi
-  }
-
-  # Find out the state of a numbered option (e.g. "commit=NNN") in
-  # fstab for a given filesystem, and use this state to replace the
-  # value of the option in another mount options string. The device
-  # is the first argument, and the option name the second. The
-  # remainder is the mount options string in which the replacement
-  # must be done.
-  #
-  # Example:
-  # parse_mount_opts_wfstab /dev/hda1 commit defaults,commit=7
-  #
-  # If fstab contains, say, "commit=3,rw" for this filesystem, then the
-  # result will be "rw,commit=3".
-  parse_mount_opts_wfstab () {
-	L_DEV="$1"
-	OPT="$2"
-	shift 2
-	L_OPTS="$*"
-	PARSEDOPTS1="$(parse_mount_opts $OPT $L_OPTS)"
-	# Watch for a default commit in fstab
-	FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)"
-	if echo "$FSTAB_OPTS" | grep "$OPT=" > /dev/null ; then
-		# option specified in fstab: extract the value, and use it
-		echo -n "$PARSEDOPTS1,$OPT="
-		echo ",$FSTAB_OPTS," | sed \
-		 -e 's/.*,'"$OPT"'=//'	\
-		 -e 's/,.*//'
-	else
-		# option not specified in fstab: set it to 0
-		echo "$PARSEDOPTS1,$OPT=0"
-	fi
-  }
-
-  deduce_fstype () {
-	MP="$1"
-	# My root filesystem unfortunately has
-	# type "unknown" in /etc/mtab. If we encounter
-	# "unknown", we try to get the type from fstab.
-	cat /etc/fstab |
-	grep -v '^#' |
-	while read FSTAB_DEV FSTAB_MP FSTAB_FST FSTAB_OPTS FSTAB_DUMP FSTAB_DUMP ; do
-		if [ "$FSTAB_MP" = "$MP" ]; then
-			echo $FSTAB_FST
-			exit 0
-		fi
-	done
-  }
-
-  if [ $DO_REMOUNT_NOATIME -eq 1 ] ; then
-	NOATIME_OPT=",noatime"
-  fi
-
-  case "$1" in
-	start)
-		AGE=$((100*$MAX_AGE))
-		XFS_AGE=$(($XFS_HZ*$MAX_AGE))
-		echo -n "Starting laptop_mode"
-
-		if [ -d /proc/sys/vm/pagebuf ] ; then
-			# (For 2.4 and early 2.6.)
-			# This only needs to be set, not reset -- it is only used when
-			# laptop mode is enabled.
-			echo $XFS_AGE > /proc/sys/vm/pagebuf/lm_flush_age
-			echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval
-		elif [ -f /proc/sys/fs/xfs/lm_age_buffer ] ; then
-			# (A couple of early 2.6 laptop mode patches had these.)
-			# The same goes for these.
-			echo $XFS_AGE > /proc/sys/fs/xfs/lm_age_buffer
-			echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval
-		elif [ -f /proc/sys/fs/xfs/age_buffer ] ; then
-			# (2.6.6)
-			# But not for these -- they are also used in normal
-			# operation.
-			echo $XFS_AGE > /proc/sys/fs/xfs/age_buffer
-			echo $XFS_AGE > /proc/sys/fs/xfs/sync_interval
-		elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then
-			# (2.6.7 upwards)
-			# And not for these either. These are in centisecs,
-			# not USER_HZ, so we have to use $AGE, not $XFS_AGE.
-			echo $AGE > /proc/sys/fs/xfs/age_buffer_centisecs
-			echo $AGE > /proc/sys/fs/xfs/xfssyncd_centisecs
-			echo 3000 > /proc/sys/fs/xfs/xfsbufd_centisecs
-		fi
-
-		case "$KLEVEL" in
-			"2.4")
-				echo 1					> /proc/sys/vm/laptop_mode
-				echo "30 500 0 0 $AGE $AGE 60 20 0"	> /proc/sys/vm/bdflush
-				;;
-			"2.6")
-				echo 5					> /proc/sys/vm/laptop_mode
-				echo "$AGE"				> /proc/sys/vm/dirty_writeback_centisecs
-				echo "$AGE"				> /proc/sys/vm/dirty_expire_centisecs
-				echo "$DIRTY_RATIO"			> /proc/sys/vm/dirty_ratio
-				echo "$DIRTY_BACKGROUND_RATIO"		> /proc/sys/vm/dirty_background_ratio
-				;;
-		esac
-		if [ $DO_REMOUNTS -eq 1 ]; then
-			cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do
-				PARSEDOPTS="$(parse_mount_opts "$OPTS")"
-				if [ "$FST" = 'unknown' ]; then
-					FST=$(deduce_fstype $MP)
-				fi
-				case "$FST" in
-					"ext3")
-						PARSEDOPTS="$(parse_mount_opts commit "$OPTS")"
-						mount $DEV -t $FST $MP -o remount,$PARSEDOPTS,commit=$MAX_AGE$NOATIME_OPT
-						;;
-					"xfs")
-						mount $DEV -t $FST $MP -o remount,$OPTS$NOATIME_OPT
-						;;
-				esac
-				if [ -b $DEV ] ; then
-					blockdev --setra $(($READAHEAD * 2)) $DEV
-				fi
-			done
-		fi
-		if [ $DO_HD -eq 1 ] ; then
-			for THISHD in $HD ; do
-				/sbin/hdparm -S $BATT_HD $THISHD > /dev/null 2>&1
-				/sbin/hdparm -B 1 $THISHD > /dev/null 2>&1
-			done
-		fi
-		if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then
-			if [ $CPU_MAXFREQ = 'slowest' ]; then
-				CPU_MAXFREQ=`cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq`
-			fi
-			echo $CPU_MAXFREQ > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
-		fi
-		echo "."
-		;;
-	stop)
-		U_AGE=$((100*$DEF_UPDATE))
-		B_AGE=$((100*$DEF_AGE))
-		echo -n "Stopping laptop_mode"
-		echo 0 > /proc/sys/vm/laptop_mode
-		if [ -f /proc/sys/fs/xfs/age_buffer -a ! -f /proc/sys/fs/xfs/lm_age_buffer ] ; then
-			# These need to be restored, if there are no lm_*.
-			echo $(($XFS_HZ*$DEF_XFS_AGE_BUFFER))	 	> /proc/sys/fs/xfs/age_buffer
-			echo $(($XFS_HZ*$DEF_XFS_SYNC_INTERVAL)) 	> /proc/sys/fs/xfs/sync_interval
-		elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then
-			# These need to be restored as well.
-			echo $((100*$DEF_XFS_AGE_BUFFER))	> /proc/sys/fs/xfs/age_buffer_centisecs
-			echo $((100*$DEF_XFS_SYNC_INTERVAL))	> /proc/sys/fs/xfs/xfssyncd_centisecs
-			echo $((100*$DEF_XFS_BUFD_INTERVAL))	> /proc/sys/fs/xfs/xfsbufd_centisecs
-		fi
-		case "$KLEVEL" in
-			"2.4")
-				echo "30 500 0 0 $U_AGE $B_AGE 60 20 0"	> /proc/sys/vm/bdflush
-				;;
-			"2.6")
-				echo "$U_AGE"				> /proc/sys/vm/dirty_writeback_centisecs
-				echo "$B_AGE"				> /proc/sys/vm/dirty_expire_centisecs
-				echo "$DEF_DIRTY_RATIO"			> /proc/sys/vm/dirty_ratio
-				echo "$DEF_DIRTY_BACKGROUND_RATIO"	> /proc/sys/vm/dirty_background_ratio
-				;;
-		esac
-		if [ $DO_REMOUNTS -eq 1 ] ; then
-			cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do
-				# Reset commit and atime options to defaults.
-				if [ "$FST" = 'unknown' ]; then
-					FST=$(deduce_fstype $MP)
-				fi
-				case "$FST" in
-					"ext3")
-						PARSEDOPTS="$(parse_mount_opts_wfstab $DEV commit $OPTS)"
-						PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $PARSEDOPTS)"
-						mount $DEV -t $FST $MP -o remount,$PARSEDOPTS
-						;;
-					"xfs")
-						PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $OPTS)"
-						mount $DEV -t $FST $MP -o remount,$PARSEDOPTS
-						;;
-				esac
-				if [ -b $DEV ] ; then
-					blockdev --setra 256 $DEV
-				fi
-			done
-		fi
-		if [ $DO_HD -eq 1 ] ; then
-			for THISHD in $HD ; do
-				/sbin/hdparm -S $AC_HD $THISHD > /dev/null 2>&1
-				/sbin/hdparm -B 255 $THISHD > /dev/null 2>&1
-			done
-		fi
-		if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then
-			echo `cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq` > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
-		fi
-		echo "."
-		;;
-	*)
-		echo "Usage: $0 {start|stop}" 2>&1
-		exit 1
-		;;
-
-  esac
-
-  exit 0
-
-
-ACPI integration
-----------------
-
-Dax Kelson submitted this so that the ACPI acpid daemon will
-kick off the laptop_mode script and run hdparm. The part that
-automatically disables laptop mode when the battery is low was
-written by Jan Topinski.
-
-/etc/acpi/events/ac_adapter::
-
-	event=ac_adapter
-	action=/etc/acpi/actions/ac.sh %e
-
-/etc/acpi/events/battery::
-
-	event=battery.*
-	action=/etc/acpi/actions/battery.sh %e
-
-/etc/acpi/actions/ac.sh::
-
-  #!/bin/bash
-
-  # ac on/offline event handler
-
-  status=`awk '/^state: / { print $2 }' /proc/acpi/ac_adapter/$2/state`
-
-  case $status in
-          "on-line")
-                  /sbin/laptop_mode stop
-                  exit 0
-          ;;
-          "off-line")
-                  /sbin/laptop_mode start
-                  exit 0
-          ;;
-  esac
-
-
-/etc/acpi/actions/battery.sh::
-
-  #! /bin/bash
-
-  # Automatically disable laptop mode when the battery almost runs out.
-
-  BATT_INFO=/proc/acpi/battery/$2/state
-
-  if [[ -f /proc/sys/vm/laptop_mode ]]
-  then
-     LM=`cat /proc/sys/vm/laptop_mode`
-     if [[ $LM -gt 0 ]]
-     then
-       if [[ -f $BATT_INFO ]]
-       then
-          # Source the config file only now that we know we need
-          if [ -f /etc/default/laptop-mode ] ; then
-                  # Debian
-                  . /etc/default/laptop-mode
-          elif [ -f /etc/sysconfig/laptop-mode ] ; then
-                  # Others
-                  . /etc/sysconfig/laptop-mode
-          fi
-          MINIMUM_BATTERY_MINUTES=${MINIMUM_BATTERY_MINUTES:-'10'}
-
-          ACTION="`cat $BATT_INFO | grep charging | cut -c 26-`"
-          if [[ ACTION -eq "discharging" ]]
-          then
-             PRESENT_RATE=`cat $BATT_INFO | grep "present rate:" | sed  "s/.* \([0-9][0-9]* \).*/\1/" `
-             REMAINING=`cat $BATT_INFO | grep "remaining capacity:" | sed  "s/.* \([0-9][0-9]* \).*/\1/" `
-          fi
-          if (($REMAINING * 60 / $PRESENT_RATE < $MINIMUM_BATTERY_MINUTES))
-          then
-             /sbin/laptop_mode stop
-          fi
-       else
-         logger -p daemon.warning "You are using laptop mode and your battery interface $BATT_INFO is missing. This may lead to loss of data when the battery runs out. Check kernel ACPI support and /proc/acpi/battery folder, and edit /etc/acpi/battery.sh to set BATT_INFO to the correct path."
-       fi
-     fi
-  fi
-
-
-Monitoring tool
----------------
-
-Bartek Kania submitted this, it can be used to measure how much time your disk
-spends spun up/down.  See tools/laptop/dslm/dslm.c
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 245bf6394935..ca6ebeb5171c 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -41,7 +41,6 @@ Currently, these files are in /proc/sys/vm:
 - extfrag_threshold
 - highmem_is_dirtyable
 - hugetlb_shm_group
-- laptop_mode
 - legacy_va_layout
 - lowmem_reserve_ratio
 - max_map_count
@@ -363,13 +362,6 @@ hugetlb_shm_group contains group id that is allowed to create SysV
 shared memory segment using hugetlb page.
 
 
-laptop_mode
-===========
-
-laptop_mode is a knob that controls "laptop mode". All the things that are
-controlled by this knob are discussed in Documentation/admin-guide/laptops/laptop-mode.rst.
-
-
 legacy_va_layout
 ================
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a29d8ac9d3e3..4bae7c4c664e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -811,9 +811,6 @@ void blk_mq_free_request(struct request *rq)
 
 	blk_mq_finish_request(rq);
 
-	if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
-		laptop_io_completion(q->disk->bdi);
-
 	rq_qos_done(q, rq);
 
 	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0c466ccbed69..15eb463d5a9b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3305,8 +3305,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
 	/*
 	 * We do something simple for now.  The filemap_flush() will
 	 * also start triggering a write of the data blocks, which is
-	 * not strictly speaking necessary (and for users of
-	 * laptop_mode, not even desirable).  However, to do otherwise
+	 * not strictly speaking necessary.  However, to do otherwise
 	 * would require replicating code paths in:
 	 *
 	 * ext4_writepages() ->
diff --git a/fs/sync.c b/fs/sync.c
index 431fc5f5be06..6330150792f6 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -104,8 +104,6 @@ void ksys_sync(void)
 	iterate_supers(sync_fs_one_sb, &wait);
 	sync_bdevs(false);
 	sync_bdevs(true);
-	if (unlikely(laptop_mode))
-		laptop_sync_completion();
 }
 
 SYSCALL_DEFINE0(sync)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index bc71aa9dcee8..a2014fb1bc66 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -845,15 +845,6 @@ xfs_fs_sync_fs(
 	if (error)
 		return error;
 
-	if (laptop_mode) {
-		/*
-		 * The disk must be active because we're syncing.
-		 * We schedule log work now (now that the disk is
-		 * active) instead of later (when it might not be).
-		 */
-		flush_delayed_work(&mp->m_log->l_work);
-	}
-
 	/*
 	 * If we are called with page faults frozen out, it means we are about
 	 * to freeze the transaction subsystem. Take the opportunity to shut
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 0217c1073735..c88fd4d37d1f 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -46,7 +46,6 @@ enum wb_reason {
 	WB_REASON_VMSCAN,
 	WB_REASON_SYNC,
 	WB_REASON_PERIODIC,
-	WB_REASON_LAPTOP_TIMER,
 	WB_REASON_FS_FREE_SPACE,
 	/*
 	 * There is no bdi forker thread any more and works are done
@@ -204,8 +203,6 @@ struct backing_dev_info {
 	char dev_name[64];
 	struct device *owner;
 
-	struct timer_list laptop_mode_wb_timer;
-
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debug_dir;
 #endif
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index f48e8ccffe81..e530112c4b3a 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -328,9 +328,6 @@ struct dirty_throttle_control {
 	bool			dirty_exceeded;
 };
 
-void laptop_io_completion(struct backing_dev_info *info);
-void laptop_sync_completion(void);
-void laptop_mode_timer_fn(struct timer_list *t);
 bool node_dirty_ok(struct pglist_data *pgdat);
 int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
 #ifdef CONFIG_CGROUP_WRITEBACK
@@ -342,7 +339,6 @@ extern struct wb_domain global_wb_domain;
 /* These are exported to sysctl. */
 extern unsigned int dirty_writeback_interval;
 extern unsigned int dirty_expire_interval;
-extern int laptop_mode;
 
 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
 unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 311a341e6fe4..b6f94e97788a 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -42,7 +42,6 @@
 	EM( WB_REASON_VMSCAN,			"vmscan")		\
 	EM( WB_REASON_SYNC,			"sync")			\
 	EM( WB_REASON_PERIODIC,			"periodic")		\
-	EM( WB_REASON_LAPTOP_TIMER,		"laptop_timer")		\
 	EM( WB_REASON_FS_FREE_SPACE,		"fs_free_space")	\
 	EM( WB_REASON_FORKER_THREAD,		"forker_thread")	\
 	EMe(WB_REASON_FOREIGN_FLUSH,		"foreign_flush")
diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h
index 63d1464cb71c..6ea9ea8413fa 100644
--- a/include/uapi/linux/sysctl.h
+++ b/include/uapi/linux/sysctl.h
@@ -183,7 +183,7 @@ enum
 	VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */
 	VM_MIN_FREE_KBYTES=21,	/* Minimum free kilobytes to maintain */
 	VM_MAX_MAP_COUNT=22,	/* int: Maximum number of mmaps/address-space */
-	VM_LAPTOP_MODE=23,	/* vm laptop mode */
+
 	VM_BLOCK_DUMP=24,	/* block dump mode */
 	VM_HUGETLB_GROUP=25,	/* permitted hugetlb group */
 	VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c5740c6d37a2..a0e26d1b717f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1034,7 +1034,6 @@ struct backing_dev_info *bdi_alloc(int node_id)
 	bdi->capabilities = BDI_CAP_WRITEBACK;
 	bdi->ra_pages = VM_READAHEAD_PAGES;
 	bdi->io_pages = VM_READAHEAD_PAGES;
-	timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0);
 	return bdi;
 }
 EXPORT_SYMBOL(bdi_alloc);
@@ -1156,8 +1155,6 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
 
 void bdi_unregister(struct backing_dev_info *bdi)
 {
-	timer_delete_sync(&bdi->laptop_mode_wb_timer);
-
 	/* make sure nobody finds us on the bdi_list anymore */
 	bdi_remove_from_list(bdi);
 	wb_shutdown(&bdi->wb);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ccdeb0e84d39..601a5e048d12 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -109,14 +109,6 @@ EXPORT_SYMBOL_GPL(dirty_writeback_interval);
  */
 unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
 
-/*
- * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
- * a full sync is triggered after this time elapses without any disk activity.
- */
-int laptop_mode;
-
-EXPORT_SYMBOL(laptop_mode);
-
 /* End of sysctl-exported parameters */
 
 struct wb_domain global_wb_domain;
@@ -1843,17 +1835,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
 			balance_domain_limits(mdtc, strictlimit);
 		}
 
-		/*
-		 * In laptop mode, we wait until hitting the higher threshold
-		 * before starting background writeout, and then write out all
-		 * the way down to the lower threshold.  So slow writers cause
-		 * minimal disk activity.
-		 *
-		 * In normal mode, we start background writeout at the lower
-		 * background_thresh, to keep the amount of dirty memory low.
-		 */
-		if (!laptop_mode && nr_dirty > gdtc->bg_thresh &&
-		    !writeback_in_progress(wb))
+		if (nr_dirty > gdtc->bg_thresh && !writeback_in_progress(wb))
 			wb_start_background_writeback(wb);
 
 		/*
@@ -1876,10 +1858,6 @@ free_running:
 			break;
 		}
 
-		/* Start writeback even when in laptop mode */
-		if (unlikely(!writeback_in_progress(wb)))
-			wb_start_background_writeback(wb);
-
 		mem_cgroup_flush_foreign(wb);
 
 		/*
@@ -2198,41 +2176,6 @@ static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int
 }
 #endif
 
-void laptop_mode_timer_fn(struct timer_list *t)
-{
-	struct backing_dev_info *backing_dev_info =
-		timer_container_of(backing_dev_info, t, laptop_mode_wb_timer);
-
-	wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER);
-}
-
-/*
- * We've spun up the disk and we're in laptop mode: schedule writeback
- * of all dirty data a few seconds from now.  If the flush is already scheduled
- * then push it back - the user is still using the disk.
- */
-void laptop_io_completion(struct backing_dev_info *info)
-{
-	mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
-}
-
-/*
- * We're in laptop mode and we've just synced. The sync's writes will have
- * caused another writeback to be scheduled by laptop_io_completion.
- * Nothing needs to be written back anymore, so we unschedule the writeback.
- */
-void laptop_sync_completion(void)
-{
-	struct backing_dev_info *bdi;
-
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
-		timer_delete(&bdi->laptop_mode_wb_timer);
-
-	rcu_read_unlock();
-}
-
 /*
  * If ratelimit_pages is too high then we can get into dirty-data overload
  * if a large number of processes all perform writes at the same time.
@@ -2263,6 +2206,19 @@ static int page_writeback_cpu_online(unsigned int cpu)
 
 #ifdef CONFIG_SYSCTL
 
+static int laptop_mode;
+static int laptop_mode_handler(const struct ctl_table *table, int write,
+			       void *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret = proc_dointvec_jiffies(table, write, buffer, lenp, ppos);
+
+	if (!ret && write)
+		pr_warn("%s: vm.laptop_mode is deprecated. Ignoring setting.\n",
+			current->comm);
+
+	return ret;
+}
+
 /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
 static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
 
@@ -2332,7 +2288,7 @@ static const struct ctl_table vm_page_writeback_sysctls[] = {
 		.data		= &laptop_mode,
 		.maxlen		= sizeof(laptop_mode),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
+		.proc_handler	= laptop_mode_handler,
 	},
 };
 #endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1c87945fa761..fc5691afb998 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -104,13 +104,13 @@ struct scan_control {
 	unsigned int force_deactivate:1;
 	unsigned int skipped_deactivate:1;
 
-	/* Writepage batching in laptop mode; RECLAIM_WRITE */
+	/* zone_reclaim_mode, boost reclaim */
 	unsigned int may_writepage:1;
 
-	/* Can mapped folios be reclaimed? */
+	/* zone_reclaim_mode */
 	unsigned int may_unmap:1;
 
-	/* Can folios be swapped as part of reclaim? */
+	/* zome_reclaim_mode, boost reclaim, cgroup restrictions */
 	unsigned int may_swap:1;
 
 	/* Not allow cache_trim_mode to be turned on as part of reclaim? */
@@ -6365,13 +6365,6 @@ retry:
 
 		if (sc->compaction_ready)
 			break;
-
-		/*
-		 * If we're getting trouble reclaiming, start doing
-		 * writepage even in laptop mode.
-		 */
-		if (sc->priority < DEF_PRIORITY - 2)
-			sc->may_writepage = 1;
 	} while (--sc->priority >= 0);
 
 	last_pgdat = NULL;
@@ -6580,7 +6573,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.order = order,
 		.nodemask = nodemask,
 		.priority = DEF_PRIORITY,
-		.may_writepage = !laptop_mode,
+		.may_writepage = 1,
 		.may_unmap = 1,
 		.may_swap = 1,
 	};
@@ -6624,7 +6617,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 	struct scan_control sc = {
 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
 		.target_mem_cgroup = memcg,
-		.may_writepage = !laptop_mode,
+		.may_writepage = 1,
 		.may_unmap = 1,
 		.reclaim_idx = MAX_NR_ZONES - 1,
 		.may_swap = !noswap,
@@ -6670,7 +6663,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.reclaim_idx = MAX_NR_ZONES - 1,
 		.target_mem_cgroup = memcg,
 		.priority = DEF_PRIORITY,
-		.may_writepage = !laptop_mode,
+		.may_writepage = 1,
 		.may_unmap = 1,
 		.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
 		.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
@@ -7051,7 +7044,7 @@ restart:
 		 * from reclaim context. If no pages are reclaimed, the
 		 * reclaim will be aborted.
 		 */
-		sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
+		sc.may_writepage = !nr_boost_reclaim;
 		sc.may_swap = !nr_boost_reclaim;
 
 		/*
@@ -7061,13 +7054,6 @@ restart:
 		 */
 		kswapd_age_node(pgdat, &sc);
 
-		/*
-		 * If we're getting trouble reclaiming, start doing writepage
-		 * even in laptop mode.
-		 */
-		if (sc.priority < DEF_PRIORITY - 2)
-			sc.may_writepage = 1;
-
 		/* Call soft limit reclaim before calling shrink_node. */
 		sc.nr_scanned = 0;
 		nr_soft_scanned = 0;
@@ -7799,7 +7785,7 @@ int user_proactive_reclaim(char *buf,
 				.reclaim_idx = gfp_zone(gfp_mask),
 				.proactive_swappiness = swappiness == -1 ? NULL : &swappiness,
 				.priority = DEF_PRIORITY,
-				.may_writepage = !laptop_mode,
+				.may_writepage = 1,
 				.nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX),
 				.may_unmap = 1,
 				.may_swap = 1,
-- 
cgit v1.2.3


From 5173ae0a068d64643ccf4915b7cbedf82810a592 Mon Sep 17 00:00:00 2001
From: Shivank Garg <shivankg@amd.com>
Date: Sun, 18 Jan 2026 19:09:40 +0000
Subject: mm/khugepaged: map dirty/writeback pages failures to EAGAIN

Patch series "mm/khugepaged: fix dirty page handling for MADV_COLLAPSE",
v5.

MADV_COLLAPSE on file-backed mappings fails with -EINVAL when TEXT pages
are dirty. This affects scenarios like package/container updates or
executing binaries immediately after writing them, etc.

The issue is that collapse_file() triggers async writeback and returns
SCAN_FAIL (maps to -EINVAL), expecting khugepaged to revisit later. But
MADV_COLLAPSE is synchronous and userspace expects immediate success or
a clear retry signal.

Reproduction:
 - Compile or copy 2MB-aligned executable to XFS/ext4 FS
 - Call MADV_COLLAPSE on .text section
 - First call fails with -EINVAL (text pages dirty from copy)
 - Second call succeeds (async writeback completed)

Issue Report:
https://lore.kernel.org/all/4e26fe5e-7374-467c-a333-9dd48f85d7cc@amd.com


This patch (of 2):

When collapse_file encounters dirty or writeback pages in file-backed
mappings, it currently returns SCAN_FAIL which maps to -EINVAL.  This is
misleading as EINVAL suggests invalid arguments, whereas dirty/writeback
pages represent transient conditions that may resolve on retry.

Introduce SCAN_PAGE_DIRTY_OR_WRITEBACK to cover both dirty and writeback
states, mapping it to -EAGAIN.  For MADV_COLLAPSE, this provides userspace
with a clear signal that retry may succeed after writeback completes.  For
khugepaged, this is harmless as it will naturally revisit the range during
periodic scans after async writeback completes.

Link: https://lkml.kernel.org/r/20260118190939.8986-2-shivankg@amd.com
Link: https://lkml.kernel.org/r/20260118190939.8986-4-shivankg@amd.com
Fixes: 34488399fa08 ("mm/madvise: add file and shmem support to MADV_COLLAPSE")
Signed-off-by: Shivank Garg <shivankg@amd.com>
Reported-by: Branden Moore <Branden.Moore@amd.com>
Closes: https://lore.kernel.org/all/4e26fe5e-7374-467c-a333-9dd48f85d7cc@amd.com
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: wang lian <lianux.mm@gmail.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Barry Song <baohua@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Zach O'Keefe <zokeefe@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/trace/events/huge_memory.h | 3 ++-
 mm/khugepaged.c                    | 8 +++++---
 2 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'include/trace')

diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index 4cde53b45a85..4e41bff31888 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -37,7 +37,8 @@
 	EM( SCAN_PAGE_HAS_PRIVATE,	"page_has_private")		\
 	EM( SCAN_STORE_FAILED,		"store_failed")			\
 	EM( SCAN_COPY_MC,		"copy_poisoned_page")		\
-	EMe(SCAN_PAGE_FILLED,		"page_filled")
+	EM( SCAN_PAGE_FILLED,		"page_filled")			\
+	EMe(SCAN_PAGE_DIRTY_OR_WRITEBACK, "page_dirty_or_writeback")
 
 #undef EM
 #undef EMe
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 97d1b2824386..219dfa2e523c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -58,6 +58,7 @@ enum scan_result {
 	SCAN_STORE_FAILED,
 	SCAN_COPY_MC,
 	SCAN_PAGE_FILLED,
+	SCAN_PAGE_DIRTY_OR_WRITEBACK,
 };
 
 #define CREATE_TRACE_POINTS
@@ -1967,11 +1968,11 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
 				 */
 				xas_unlock_irq(&xas);
 				filemap_flush(mapping);
-				result = SCAN_FAIL;
+				result = SCAN_PAGE_DIRTY_OR_WRITEBACK;
 				goto xa_unlocked;
 			} else if (folio_test_writeback(folio)) {
 				xas_unlock_irq(&xas);
-				result = SCAN_FAIL;
+				result = SCAN_PAGE_DIRTY_OR_WRITEBACK;
 				goto xa_unlocked;
 			} else if (folio_trylock(folio)) {
 				folio_get(folio);
@@ -2018,7 +2019,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
 			 * folio is dirty because it hasn't been flushed
 			 * since first write.
 			 */
-			result = SCAN_FAIL;
+			result = SCAN_PAGE_DIRTY_OR_WRITEBACK;
 			goto out_unlock;
 		}
 
@@ -2747,6 +2748,7 @@ static int madvise_collapse_errno(enum scan_result r)
 	case SCAN_PAGE_LRU:
 	case SCAN_DEL_PAGE_LRU:
 	case SCAN_PAGE_FILLED:
+	case SCAN_PAGE_DIRTY_OR_WRITEBACK:
 		return -EAGAIN;
 	/*
 	 * Other: Trying again likely not to succeed / error intrinsic to
-- 
cgit v1.2.3


From a45088376d8a847a5e3b1982fcfceb41644e3b1d Mon Sep 17 00:00:00 2001
From: Jiayuan Chen <jiayuan.chen@shopee.com>
Date: Tue, 20 Jan 2026 10:43:49 +0800
Subject: mm/vmscan: add tracepoint and reason for kswapd_failures reset

Currently, kswapd_failures is reset in multiple places (kswapd, direct
reclaim, PCP freeing, memory-tiers), but there's no way to trace when and
why it was reset, making it difficult to debug memory reclaim issues.

This patch:

1. Introduce kswapd_clear_hopeless() as a wrapper function to
   centralize kswapd_failures reset logic.

2. Introduce kswapd_test_hopeless() to encapsulate hopeless node
   checks, replacing all open-coded kswapd_failures comparisons.

3. Add kswapd_clear_hopeless_reason enum to distinguish reset sources:
   - KSWAPD_CLEAR_HOPELESS_KSWAPD: reset from kswapd context
   - KSWAPD_CLEAR_HOPELESS_DIRECT: reset from direct reclaim
   - KSWAPD_CLEAR_HOPELESS_PCP: reset from PCP page freeing
   - KSWAPD_CLEAR_HOPELESS_OTHER: reset from other paths

4. Add tracepoints for better observability:
   - mm_vmscan_kswapd_clear_hopeless: traces each reset with reason
   - mm_vmscan_kswapd_reclaim_fail: traces each kswapd reclaim failure

Test results:

$ trace-cmd record -e vmscan:mm_vmscan_kswapd_clear_hopeless -e vmscan:mm_vmscan_kswapd_reclaim_fail
$ # generate memory pressure
$ trace-cmd report
cpus=4
 kswapd0-71    [000]    27.216563: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=1
 kswapd0-71    [000]    27.217169: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=2
 kswapd0-71    [000]    27.217764: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=3
 kswapd0-71    [000]    27.218353: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=4
 kswapd0-71    [000]    27.218993: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=5
 kswapd0-71    [000]    27.219744: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=6
 kswapd0-71    [000]    27.220488: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=7
 kswapd0-71    [000]    27.221206: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=8
 kswapd0-71    [000]    27.221806: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=9
 kswapd0-71    [000]    27.222634: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=10
 kswapd0-71    [000]    27.223286: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=11
 kswapd0-71    [000]    27.223894: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=12
 kswapd0-71    [000]    27.224712: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=13
 kswapd0-71    [000]    27.225424: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=14
 kswapd0-71    [000]    27.226082: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=15
 kswapd0-71    [000]    27.226810: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=16
 kswapd1-72    [002]    27.386869: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=1
 kswapd1-72    [002]    27.387435: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=2
 kswapd1-72    [002]    27.388016: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=3
 kswapd1-72    [002]    27.388586: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=4
 kswapd1-72    [002]    27.389155: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=5
 kswapd1-72    [002]    27.389723: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=6
 kswapd1-72    [002]    27.390292: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=7
 kswapd1-72    [002]    27.392364: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=8
 kswapd1-72    [002]    27.392934: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=9
 kswapd1-72    [002]    27.393504: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=10
 kswapd1-72    [002]    27.394073: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=11
 kswapd1-72    [002]    27.394899: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=12
 kswapd1-72    [002]    27.395472: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=13
 kswapd1-72    [002]    27.396055: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=14
 kswapd1-72    [002]    27.396628: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=15
 kswapd1-72    [002]    27.397199: mm_vmscan_kswapd_reclaim_fail: nid=1 failures=16
kworker/u18:0-40    [002]    27.410151: mm_vmscan_kswapd_clear_hopeless: nid=0 reason=DIRECT
 kswapd0-71    [000]    27.439454: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=1
 kswapd0-71    [000]    27.440048: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=2
 kswapd0-71    [000]    27.440634: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=3
 kswapd0-71    [000]    27.441211: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=4
 kswapd0-71    [000]    27.441787: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=5
 kswapd0-71    [000]    27.442363: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=6
 kswapd0-71    [000]    27.443030: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=7
 kswapd0-71    [000]    27.443725: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=8
 kswapd0-71    [000]    27.444315: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=9
 kswapd0-71    [000]    27.444898: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=10
 kswapd0-71    [000]    27.445476: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=11
 kswapd0-71    [000]    27.446053: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=12
 kswapd0-71    [000]    27.446646: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=13
 kswapd0-71    [000]    27.447230: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=14
 kswapd0-71    [000]    27.447812: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=15
 kswapd0-71    [000]    27.448391: mm_vmscan_kswapd_reclaim_fail: nid=0 failures=16
 ann-423   [003]    28.028285: mm_vmscan_kswapd_clear_hopeless: nid=0 reason=PCP

Link: https://lkml.kernel.org/r/20260120024402.387576-3-jiayuan.chen@linux.dev
Signed-off-by: Jiayuan Chen <jiayuan.chen@shopee.com>
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>	[tracing]
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Xu <weixugc@google.com>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h        | 19 ++++++++++++----
 include/trace/events/vmscan.h | 51 +++++++++++++++++++++++++++++++++++++++++++
 mm/memory-tiers.c             |  2 +-
 mm/page_alloc.c               |  4 ++--
 mm/show_mem.c                 |  3 +--
 mm/vmscan.c                   | 29 ++++++++++++++++--------
 mm/vmstat.c                   |  2 +-
 7 files changed, 91 insertions(+), 19 deletions(-)

(limited to 'include/trace')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8881198e85c6..3e51190a55e4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1534,16 +1534,27 @@ static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
 #include <linux/memory_hotplug.h>
 
 void build_all_zonelists(pg_data_t *pgdat);
-void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
-		   enum zone_type highest_zoneidx);
-void kswapd_try_clear_hopeless(struct pglist_data *pgdat,
-			       unsigned int order, int highest_zoneidx);
 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 			 int highest_zoneidx, unsigned int alloc_flags,
 			 long free_pages);
 bool zone_watermark_ok(struct zone *z, unsigned int order,
 		unsigned long mark, int highest_zoneidx,
 		unsigned int alloc_flags);
+
+enum kswapd_clear_hopeless_reason {
+	KSWAPD_CLEAR_HOPELESS_OTHER = 0,
+	KSWAPD_CLEAR_HOPELESS_KSWAPD,
+	KSWAPD_CLEAR_HOPELESS_DIRECT,
+	KSWAPD_CLEAR_HOPELESS_PCP,
+};
+
+void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
+		   enum zone_type highest_zoneidx);
+void kswapd_try_clear_hopeless(struct pglist_data *pgdat,
+			       unsigned int order, int highest_zoneidx);
+void kswapd_clear_hopeless(pg_data_t *pgdat, enum kswapd_clear_hopeless_reason reason);
+bool kswapd_test_hopeless(pg_data_t *pgdat);
+
 /*
  * Memory initialization context, use to differentiate memory added by
  * the platform statically or via memory hotplug interface.
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 490958fa10de..ea58e4656abf 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -40,6 +40,16 @@
 		{_VMSCAN_THROTTLE_CONGESTED,	"VMSCAN_THROTTLE_CONGESTED"}	\
 		) : "VMSCAN_THROTTLE_NONE"
 
+TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_OTHER);
+TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_KSWAPD);
+TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_DIRECT);
+TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_PCP);
+
+#define kswapd_clear_hopeless_reason_ops		\
+	{KSWAPD_CLEAR_HOPELESS_KSWAPD,	"KSWAPD"},	\
+	{KSWAPD_CLEAR_HOPELESS_DIRECT,	"DIRECT"},	\
+	{KSWAPD_CLEAR_HOPELESS_PCP,	"PCP"},		\
+	{KSWAPD_CLEAR_HOPELESS_OTHER,	"OTHER"}
 
 #define trace_reclaim_flags(file) ( \
 	(file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
@@ -535,6 +545,47 @@ TRACE_EVENT(mm_vmscan_throttled,
 		__entry->usec_delayed,
 		show_throttle_flags(__entry->reason))
 );
+
+TRACE_EVENT(mm_vmscan_kswapd_reclaim_fail,
+
+	TP_PROTO(int nid, int failures),
+
+	TP_ARGS(nid, failures),
+
+	TP_STRUCT__entry(
+		__field(int, nid)
+		__field(int, failures)
+	),
+
+	TP_fast_assign(
+		__entry->nid = nid;
+		__entry->failures = failures;
+	),
+
+	TP_printk("nid=%d failures=%d",
+		__entry->nid, __entry->failures)
+);
+
+TRACE_EVENT(mm_vmscan_kswapd_clear_hopeless,
+
+	TP_PROTO(int nid, int reason),
+
+	TP_ARGS(nid, reason),
+
+	TP_STRUCT__entry(
+		__field(int, nid)
+		__field(int, reason)
+	),
+
+	TP_fast_assign(
+		__entry->nid = nid;
+		__entry->reason = reason;
+	),
+
+	TP_printk("nid=%d reason=%s",
+		__entry->nid,
+		__print_symbolic(__entry->reason, kswapd_clear_hopeless_reason_ops))
+);
 #endif /* _TRACE_VMSCAN_H */
 
 /* This part must be outside protection */
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 7ec442776574..0ae8bec86346 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -955,7 +955,7 @@ static ssize_t demotion_enabled_store(struct kobject *kobj,
 		struct pglist_data *pgdat;
 
 		for_each_online_pgdat(pgdat)
-			atomic_set(&pgdat->kswapd_failures, 0);
+			kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_OTHER);
 	}
 
 	return count;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e779b18168de..2c70ba9d5cc6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2945,9 +2945,9 @@ static bool free_frozen_page_commit(struct zone *zone,
 		 * 'hopeless node' to stay in that state for a while.  Let
 		 * kswapd work again by resetting kswapd_failures.
 		 */
-		if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES &&
+		if (kswapd_test_hopeless(pgdat) &&
 		    next_memory_node(pgdat->node_id) < MAX_NUMNODES)
-			atomic_set(&pgdat->kswapd_failures, 0);
+			kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_PCP);
 	}
 	return ret;
 }
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 3a4b5207635d..24078ac3e6bc 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -278,8 +278,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
 #endif
 			K(node_page_state(pgdat, NR_PAGETABLE)),
 			K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
-			str_yes_no(atomic_read(&pgdat->kswapd_failures) >=
-				   MAX_RECLAIM_RETRIES),
+			str_yes_no(kswapd_test_hopeless(pgdat)),
 			K(node_page_state(pgdat, NR_BALLOON_PAGES)));
 	}
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5d9b1bce6f01..1d281174164e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -506,7 +506,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
 	 * If kswapd is disabled, reschedule if necessary but do not
 	 * throttle as the system is likely near OOM.
 	 */
-	if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
+	if (kswapd_test_hopeless(pgdat))
 		return true;
 
 	/*
@@ -6437,7 +6437,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
 	int i;
 	bool wmark_ok;
 
-	if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
+	if (kswapd_test_hopeless(pgdat))
 		return true;
 
 	for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
@@ -6846,7 +6846,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
 		wake_up_all(&pgdat->pfmemalloc_wait);
 
 	/* Hopeless node, leave it to direct reclaim */
-	if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
+	if (kswapd_test_hopeless(pgdat))
 		return true;
 
 	if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
@@ -7111,8 +7111,11 @@ restart:
 	 * watermark_high at this point. We need to avoid increasing the
 	 * failure count to prevent the kswapd thread from stopping.
 	 */
-	if (!sc.nr_reclaimed && !boosted)
-		atomic_inc(&pgdat->kswapd_failures);
+	if (!sc.nr_reclaimed && !boosted) {
+		int fail_cnt = atomic_inc_return(&pgdat->kswapd_failures);
+		/* kswapd context, low overhead to trace every failure */
+		trace_mm_vmscan_kswapd_reclaim_fail(pgdat->node_id, fail_cnt);
+	}
 
 out:
 	clear_reclaim_active(pgdat, highest_zoneidx);
@@ -7371,7 +7374,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
 		return;
 
 	/* Hopeless node, leave it to direct reclaim if possible */
-	if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES ||
+	if (kswapd_test_hopeless(pgdat) ||
 	    (pgdat_balanced(pgdat, order, highest_zoneidx) &&
 	     !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
 		/*
@@ -7391,9 +7394,11 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
 	wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
-static void kswapd_clear_hopeless(pg_data_t *pgdat)
+void kswapd_clear_hopeless(pg_data_t *pgdat, enum kswapd_clear_hopeless_reason reason)
 {
-	atomic_set(&pgdat->kswapd_failures, 0);
+	/* Only trace actual resets, not redundant zero-to-zero */
+	if (atomic_xchg(&pgdat->kswapd_failures, 0))
+		trace_mm_vmscan_kswapd_clear_hopeless(pgdat->node_id, reason);
 }
 
 /*
@@ -7406,7 +7411,13 @@ void kswapd_try_clear_hopeless(struct pglist_data *pgdat,
 			       unsigned int order, int highest_zoneidx)
 {
 	if (pgdat_balanced(pgdat, order, highest_zoneidx))
-		kswapd_clear_hopeless(pgdat);
+		kswapd_clear_hopeless(pgdat, current_is_kswapd() ?
+			KSWAPD_CLEAR_HOPELESS_KSWAPD : KSWAPD_CLEAR_HOPELESS_DIRECT);
+}
+
+bool kswapd_test_hopeless(pg_data_t *pgdat)
+{
+	return atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES;
 }
 
 #ifdef CONFIG_HIBERNATION
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 0f64c898f79f..23e176e1d09d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1840,7 +1840,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   "\n  start_pfn:           %lu"
 		   "\n  reserved_highatomic: %lu"
 		   "\n  free_highatomic:     %lu",
-		   atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES,
+		   kswapd_test_hopeless(pgdat),
 		   zone->zone_start_pfn,
 		   zone->nr_reserved_highatomic,
 		   zone->nr_free_highatomic);
-- 
cgit v1.2.3