diff options
Diffstat (limited to 'tools/testing/selftests')
64 files changed, 4043 insertions, 770 deletions
diff --git a/tools/testing/selftests/cgroup/.gitignore b/tools/testing/selftests/cgroup/.gitignore index 306ee1b01e72..c4a57e69f749 100644 --- a/tools/testing/selftests/cgroup/.gitignore +++ b/tools/testing/selftests/cgroup/.gitignore @@ -5,3 +5,4 @@ test_freezer test_kmem test_kill test_cpu +wait_inotify diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile index 478217cc1371..3d263747d2ad 100644 --- a/tools/testing/selftests/cgroup/Makefile +++ b/tools/testing/selftests/cgroup/Makefile @@ -1,10 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS += -Wall -pthread -all: +all: ${HELPER_PROGS} TEST_FILES := with_stress.sh -TEST_PROGS := test_stress.sh +TEST_PROGS := test_stress.sh test_cpuset_prs.sh +TEST_GEN_FILES := wait_inotify TEST_GEN_PROGS = test_memcontrol TEST_GEN_PROGS += test_kmem TEST_GEN_PROGS += test_core diff --git a/tools/testing/selftests/cgroup/config b/tools/testing/selftests/cgroup/config index 84fe884fad86..97d549ee894f 100644 --- a/tools/testing/selftests/cgroup/config +++ b/tools/testing/selftests/cgroup/config @@ -4,5 +4,4 @@ CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_SCHED=y CONFIG_MEMCG=y CONFIG_MEMCG_KMEM=y -CONFIG_MEMCG_SWAP=y CONFIG_PAGE_COUNTER=y diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh new file mode 100755 index 000000000000..526d2c42d870 --- /dev/null +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -0,0 +1,674 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test for cpuset v2 partition root state (PRS) +# +# The sched verbose flag is set, if available, so that the console log +# can be examined for the correct setting of scheduling domain. +# + +skip_test() { + echo "$1" + echo "Test SKIPPED" + exit 0 +} + +[[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!" + +# Set sched verbose flag, if available +[[ -d /sys/kernel/debug/sched ]] && echo Y > /sys/kernel/debug/sched/verbose + +# Get wait_inotify location +WAIT_INOTIFY=$(cd $(dirname $0); pwd)/wait_inotify + +# Find cgroup v2 mount point +CGROUP2=$(mount -t cgroup2 | head -1 | awk -e '{print $3}') +[[ -n "$CGROUP2" ]] || skip_test "Cgroup v2 mount point not found!" + +CPUS=$(lscpu | grep "^CPU(s)" | sed -e "s/.*:[[:space:]]*//") +[[ $CPUS -lt 8 ]] && skip_test "Test needs at least 8 cpus available!" + +# Set verbose flag and delay factor +PROG=$1 +VERBOSE= +DELAY_FACTOR=1 +while [[ "$1" = -* ]] +do + case "$1" in + -v) VERBOSE=1 + break + ;; + -d) DELAY_FACTOR=$2 + shift + break + ;; + *) echo "Usage: $PROG [-v] [-d <delay-factor>]" + exit + ;; + esac + shift +done + +cd $CGROUP2 +echo +cpuset > cgroup.subtree_control +[[ -d test ]] || mkdir test +cd test + +# Pause in seconds (repeated DELAY_FACTOR times) +pause() +{ + DELAY=$1 + LOOP=0 + while [[ $LOOP -lt $DELAY_FACTOR ]] + do + sleep $DELAY + ((LOOP++)) + done + return 0 +} + +console_msg() +{ + MSG=$1 + echo "$MSG" + echo "" > /dev/console + echo "$MSG" > /dev/console + pause 0.01 +} + +test_partition() +{ + EXPECTED_VAL=$1 + echo $EXPECTED_VAL > cpuset.cpus.partition + [[ $?
-eq 0 ]] || exit 1 + ACTUAL_VAL=$(cat cpuset.cpus.partition) + [[ $ACTUAL_VAL != $EXPECTED_VAL ]] && { + echo "cpuset.cpus.partition: expect $EXPECTED_VAL, found $ACTUAL_VAL" + echo "Test FAILED" + exit 1 + } +} + +test_effective_cpus() +{ + EXPECTED_VAL=$1 + ACTUAL_VAL=$(cat cpuset.cpus.effective) + [[ "$ACTUAL_VAL" != "$EXPECTED_VAL" ]] && { + echo "cpuset.cpus.effective: expect '$EXPECTED_VAL', found '$ACTUAL_VAL'" + echo "Test FAILED" + exit 1 + } +} + +# Adding current process to cgroup.procs as a test +test_add_proc() +{ + OUTSTR="$1" + ERRMSG=$((echo $$ > cgroup.procs) |& cat) + echo $ERRMSG | grep -q "$OUTSTR" + [[ $? -ne 0 ]] && { + echo "cgroup.procs: expect '$OUTSTR', got '$ERRMSG'" + echo "Test FAILED" + exit 1 + } + echo $$ > $CGROUP2/cgroup.procs # Move out the task +} + +# +# Testing the new "isolated" partition root type +# +test_isolated() +{ + echo 2-3 > cpuset.cpus + TYPE=$(cat cpuset.cpus.partition) + [[ $TYPE = member ]] || echo member > cpuset.cpus.partition + + console_msg "Change from member to root" + test_partition root + + console_msg "Change from root to isolated" + test_partition isolated + + console_msg "Change from isolated to member" + test_partition member + + console_msg "Change from member to isolated" + test_partition isolated + + console_msg "Change from isolated to root" + test_partition root + + console_msg "Change from root to member" + test_partition member + + # + # Testing partition root with no cpu + # + console_msg "Distribute all cpus to child partition" + echo +cpuset > cgroup.subtree_control + test_partition root + + mkdir A1 + cd A1 + echo 2-3 > cpuset.cpus + test_partition root + test_effective_cpus 2-3 + cd .. + test_effective_cpus "" + + console_msg "Moving task to partition test" + test_add_proc "No space left" + cd A1 + test_add_proc "" + cd .. + + console_msg "Shrink and expand child partition" + cd A1 + echo 2 > cpuset.cpus + cd .. + test_effective_cpus 3 + cd A1 + echo 2-3 > cpuset.cpus + cd .. + test_effective_cpus "" + + # Cleaning up + console_msg "Cleaning up" + echo $$ > $CGROUP2/cgroup.procs + [[ -d A1 ]] && rmdir A1 +} + +# +# Cpuset controller state transition test matrix. +# +# Cgroup test hierarchy +# +# test -- A1 -- A2 -- A3 +# \- B1 +# +# P<v> = set cpus.partition (0:member, 1:root, 2:isolated, -1:root invalid) +# C<l> = add cpu-list +# S<p> = use prefix in subtree_control +# T = put a task into cgroup +# O<c>-<v> = Write <v> to CPU online file of <c> +# +SETUP_A123_PARTITIONS="C1-3:P1:S+ C2-3:P1:S+ C3:P1" +TEST_MATRIX=( + # test old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate + # ---- ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ + " S+ C0-1 . . C2-3 S+ C4-5 . . 0 A2:0-1" + " S+ C0-1 . . C2-3 P1 . . . 0 " + " S+ C0-1 . . C2-3 P1:S+ C0-1:P1 . . 0 " + " S+ C0-1 . . C2-3 P1:S+ C1:P1 . . 0 " + " S+ C0-1:S+ . . C2-3 . . . P1 0 " + " S+ C0-1:P1 . . C2-3 S+ C1 . . 0 " + " S+ C0-1:P1 . . C2-3 S+ C1:P1 . . 0 " + " S+ C0-1:P1 . . C2-3 S+ C1:P1 . P1 0 " + " S+ C0-1:P1 . . C2-3 C4-5 . . . 0 A1:4-5" + " S+ C0-1:P1 . . C2-3 S+:C4-5 . . . 0 A1:4-5" + " S+ C0-1 . . C2-3:P1 . . . C2 0 " + " S+ C0-1 . . C2-3:P1 . . . C4-5 0 B1:4-5" + " S+ C0-3:P1:S+ C2-3:P1 . . . . . . 0 A1:0-1,A2:2-3" + " S+ C0-3:P1:S+ C2-3:P1 . . C1-3 . . . 0 A1:1,A2:2-3" + " S+ C2-3:P1:S+ C3:P1 . . C3 . . . 0 A1:,A2:3 A1:P1,A2:P1" + " S+ C2-3:P1:S+ C3:P1 . . C3 P0 . . 0 A1:3,A2:3 A1:P1,A2:P0" + " S+ C2-3:P1:S+ C2:P1 . . C2-4 . . . 0 A1:3-4,A2:2" + " S+ C2-3:P1:S+ C3:P1 . . C3 . .
C0-2 0 A1:,B1:0-2 A1:P1,A2:P1" + " S+ $SETUP_A123_PARTITIONS . C2-3 . . . 0 A1:,A2:2,A3:3 A1:P1,A2:P1,A3:P1" + + # CPU offlining cases: + " S+ C0-1 . . C2-3 S+ C4-5 . O2-0 0 A1:0-1,B1:3" + " S+ C0-3:P1:S+ C2-3:P1 . . O2-0 . . . 0 A1:0-1,A2:3" + " S+ C0-3:P1:S+ C2-3:P1 . . O2-0 O2-1 . . 0 A1:0-1,A2:2-3" + " S+ C0-3:P1:S+ C2-3:P1 . . O1-0 . . . 0 A1:0,A2:2-3" + " S+ C0-3:P1:S+ C2-3:P1 . . O1-0 O1-1 . . 0 A1:0-1,A2:2-3" + " S+ C2-3:P1:S+ C3:P1 . . O3-0 O3-1 . . 0 A1:2,A2:3 A1:P1,A2:P1" + " S+ C2-3:P1:S+ C3:P2 . . O3-0 O3-1 . . 0 A1:2,A2:3 A1:P1,A2:P2" + " S+ C2-3:P1:S+ C3:P1 . . O2-0 O2-1 . . 0 A1:2,A2:3 A1:P1,A2:P1" + " S+ C2-3:P1:S+ C3:P2 . . O2-0 O2-1 . . 0 A1:2,A2:3 A1:P1,A2:P2" + " S+ C2-3:P1:S+ C3:P1 . . O2-0 . . . 0 A1:,A2:3 A1:P1,A2:P1" + " S+ C2-3:P1:S+ C3:P1 . . O3-0 . . . 0 A1:2,A2: A1:P1,A2:P1" + " S+ C2-3:P1:S+ C3:P1 . . T:O2-0 . . . 0 A1:3,A2:3 A1:P1,A2:P-1" + " S+ C2-3:P1:S+ C3:P1 . . . T:O3-0 . . 0 A1:2,A2:2 A1:P1,A2:P-1" + " S+ $SETUP_A123_PARTITIONS . O1-0 . . . 0 A1:,A2:2,A3:3 A1:P1,A2:P1,A3:P1" + " S+ $SETUP_A123_PARTITIONS . O2-0 . . . 0 A1:1,A2:,A3:3 A1:P1,A2:P1,A3:P1" + " S+ $SETUP_A123_PARTITIONS . O3-0 . . . 0 A1:1,A2:2,A3: A1:P1,A2:P1,A3:P1" + " S+ $SETUP_A123_PARTITIONS . T:O1-0 . . . 0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P-1,A3:P-1" + " S+ $SETUP_A123_PARTITIONS . . T:O2-0 . . 0 A1:1,A2:3,A3:3 A1:P1,A2:P1,A3:P-1" + " S+ $SETUP_A123_PARTITIONS . . . T:O3-0 . 0 A1:1,A2:2,A3:2 A1:P1,A2:P1,A3:P-1" + " S+ $SETUP_A123_PARTITIONS . T:O1-0 O1-1 . . 0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1" + " S+ $SETUP_A123_PARTITIONS . . T:O2-0 O2-1 . 0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1" + " S+ $SETUP_A123_PARTITIONS . . . T:O3-0 O3-1 0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1" + " S+ $SETUP_A123_PARTITIONS . T:O1-0 O2-0 O1-1 . 0 A1:1,A2:,A3:3 A1:P1,A2:P1,A3:P1" + " S+ $SETUP_A123_PARTITIONS . T:O1-0 O2-0 O2-1 . 0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P-1,A3:P-1" + + # test old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate + # ---- ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ + # + # Incorrect change to cpuset.cpus invalidates partition root + # + # Adding CPUs to partition root that are not in parent's + # cpuset.cpus is allowed, but those extra CPUs are ignored. + " S+ C2-3:P1:S+ C3:P1 . . . C2-4 . . 0 A1:,A2:2-3 A1:P1,A2:P1" + + # Taking away all CPUs from parent or itself if there are tasks + # will make the partition invalid. + " S+ C2-3:P1:S+ C3:P1 . . T C2-3 . . 0 A1:2-3,A2:2-3 A1:P1,A2:P-1" + " S+ $SETUP_A123_PARTITIONS . T:C2-3 . . . 0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P-1,A3:P-1" + " S+ $SETUP_A123_PARTITIONS . T:C2-3:C1-3 . . . 0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1" + + # Changing a partition root to member makes child partitions invalid + " S+ C2-3:P1:S+ C3:P1 . . P0 . . . 0 A1:2-3,A2:3 A1:P0,A2:P-1" + " S+ $SETUP_A123_PARTITIONS . C2-3 P0 . . 0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P0,A3:P-1" + + # cpuset.cpus can contain cpus not in parent's cpuset.cpus as long + # as they overlap. + " S+ C2-3:P1:S+ . . . . C3-4:P1 . . 0 A1:2,A2:3 A1:P1,A2:P1" + + # Deletion of CPUs distributed to child cgroup is allowed. + " S+ C0-1:P1:S+ C1 . C2-3 C4-5 . . . 0 A1:4-5,A2:4-5" + + # To become a valid partition root, cpuset.cpus must overlap parent's + # cpuset.cpus. + " S+ C0-1:P1 . . C2-3 S+ C4-5:P1 . . 0 A1:0-1,A2:0-1 A1:P1,A2:P-1" + + # Enabling partition with child cpusets is allowed + " S+ C0-1:S+ C1 . C2-3 P1 . . . 0 A1:0-1,A2:1 A1:P1" + + # A partition root with non-partition root parent is invalid, but it + # can be made valid if its parent becomes a partition root too.
+ " S+ C0-1:S+ C1 . C2-3 . P2 . . 0 A1:0-1,A2:1 A1:P0,A2:P-2" + " S+ C0-1:S+ C1:P2 . C2-3 P1 . . . 0 A1:0,A2:1 A1:P1,A2:P2" + + # A non-exclusive cpuset.cpus change will invalidate partition and its siblings + " S+ C0-1:P1 . . C2-3 C0-2 . . . 0 A1:0-2,B1:2-3 A1:P-1,B1:P0" + " S+ C0-1:P1 . . P1:C2-3 C0-2 . . . 0 A1:0-2,B1:2-3 A1:P-1,B1:P-1" + " S+ C0-1 . . P1:C2-3 C0-2 . . . 0 A1:0-2,B1:2-3 A1:P0,B1:P-1" + + # test old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate + # ---- ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ + # Failure cases: + + # A task cannot be added to a partition with no cpu + " S+ C2-3:P1:S+ C3:P1 . . O2-0:T . . . 1 A1:,A2:3 A1:P1,A2:P1" +) + +# +# Write to the cpu online file +# $1 - <c>-<v> where <c> = cpu number, <v> value to be written +# +write_cpu_online() +{ + CPU=${1%-*} + VAL=${1#*-} + CPUFILE=//sys/devices/system/cpu/cpu${CPU}/online + if [[ $VAL -eq 0 ]] + then + OFFLINE_CPUS="$OFFLINE_CPUS $CPU" + else + [[ -n "$OFFLINE_CPUS" ]] && { + OFFLINE_CPUS=$(echo $CPU $CPU $OFFLINE_CPUS | fmt -1 |\ + sort | uniq -u) + } + fi + echo $VAL > $CPUFILE + pause 0.01 +} + +# +# Set controller state +# $1 - cgroup directory +# $2 - state +# $3 - showerr +# +# The presence of ":" in state means transition from one to the next. +# +set_ctrl_state() +{ + TMPMSG=/tmp/.msg_$$ + CGRP=$1 + STATE=$2 + SHOWERR=${3}${VERBOSE} + CTRL=${CTRL:=$CONTROLLER} + HASERR=0 + REDIRECT="2> $TMPMSG" + [[ -z "$STATE" || "$STATE" = '.' ]] && return 0 + + rm -f $TMPMSG + for CMD in $(echo $STATE | sed -e "s/:/ /g") + do + TFILE=$CGRP/cgroup.procs + SFILE=$CGRP/cgroup.subtree_control + PFILE=$CGRP/cpuset.cpus.partition + CFILE=$CGRP/cpuset.cpus + S=$(expr substr $CMD 1 1) + if [[ $S = S ]] + then + PREFIX=${CMD#?} + COMM="echo ${PREFIX}${CTRL} > $SFILE" + eval $COMM $REDIRECT + elif [[ $S = C ]] + then + CPUS=${CMD#?} + COMM="echo $CPUS > $CFILE" + eval $COMM $REDIRECT + elif [[ $S = P ]] + then + VAL=${CMD#?} + case $VAL in + 0) VAL=member + ;; + 1) VAL=root + ;; + 2) VAL=isolated + ;; + *) + echo "Invalid partition state - $VAL" + exit 1 + ;; + esac + COMM="echo $VAL > $PFILE" + eval $COMM $REDIRECT + elif [[ $S = O ]] + then + VAL=${CMD#?} + write_cpu_online $VAL + elif [[ $S = T ]] + then + COMM="echo 0 > $TFILE" + eval $COMM $REDIRECT + fi + RET=$? + [[ $RET -ne 0 ]] && { + [[ -n "$SHOWERR" ]] && { + echo "$COMM" + cat $TMPMSG + } + HASERR=1 + } + pause 0.01 + rm -f $TMPMSG + done + return $HASERR +} + +set_ctrl_state_noerr() +{ + CGRP=$1 + STATE=$2 + [[ -d $CGRP ]] || mkdir $CGRP + set_ctrl_state $CGRP $STATE 1 + [[ $? -ne 0 ]] && { + echo "ERROR: Failed to set $2 to cgroup $1!" + exit 1 + } +} + +online_cpus() +{ + [[ -n "OFFLINE_CPUS" ]] && { + for C in $OFFLINE_CPUS + do + write_cpu_online ${C}-1 + done + } +} + +# +# Return 1 if the list of effective cpus isn't the same as the initial list. +# +reset_cgroup_states() +{ + echo 0 > $CGROUP2/cgroup.procs + online_cpus + rmdir A1/A2/A3 A1/A2 A1 B1 > /dev/null 2>&1 + set_ctrl_state . 
S- + pause 0.01 +} + +dump_states() +{ + for DIR in A1 A1/A2 A1/A2/A3 B1 + do + ECPUS=$DIR/cpuset.cpus.effective + PRS=$DIR/cpuset.cpus.partition + [[ -e $ECPUS ]] && echo "$ECPUS: $(cat $ECPUS)" + [[ -e $PRS ]] && echo "$PRS: $(cat $PRS)" + done +} + +# +# Check effective cpus +# $1 - check string, format: <cgroup>:<cpu-list>[,<cgroup>:<cpu-list>]* +# +check_effective_cpus() +{ + CHK_STR=$1 + for CHK in $(echo $CHK_STR | sed -e "s/,/ /g") + do + set -- $(echo $CHK | sed -e "s/:/ /g") + CGRP=$1 + CPUS=$2 + [[ $CGRP = A2 ]] && CGRP=A1/A2 + [[ $CGRP = A3 ]] && CGRP=A1/A2/A3 + FILE=$CGRP/cpuset.cpus.effective + [[ -e $FILE ]] || return 1 + [[ $CPUS = $(cat $FILE) ]] || return 1 + done +} + +# +# Check cgroup states +# $1 - check string, format: <cgroup>:<state>[,<cgroup>:<state>]* +# +check_cgroup_states() +{ + CHK_STR=$1 + for CHK in $(echo $CHK_STR | sed -e "s/,/ /g") + do + set -- $(echo $CHK | sed -e "s/:/ /g") + CGRP=$1 + STATE=$2 + FILE= + EVAL=$(expr substr $STATE 2 2) + [[ $CGRP = A2 ]] && CGRP=A1/A2 + [[ $CGRP = A3 ]] && CGRP=A1/A2/A3 + + case $STATE in + P*) FILE=$CGRP/cpuset.cpus.partition + ;; + *) echo "Unknown state: $STATE!" + exit 1 + ;; + esac + VAL=$(cat $FILE) + + case "$VAL" in + member) VAL=0 + ;; + root) VAL=1 + ;; + isolated) + VAL=2 + ;; + "root invalid"*) + VAL=-1 + ;; + "isolated invalid"*) + VAL=-2 + ;; + esac + [[ $EVAL != $VAL ]] && return 1 + done + return 0 +} + +# +# Run cpuset state transition test +# $1 - test matrix name +# +# This test is somewhat fragile as delays (sleep x) are added in various +# places to make sure state changes are fully propagated before the next +# action. These delays may need to be adjusted if running in a slower machine. +# +run_state_test() +{ + TEST=$1 + CONTROLLER=cpuset + CPULIST=0-6 + I=0 + eval CNT="\${#$TEST[@]}" + + reset_cgroup_states + echo $CPULIST > cpuset.cpus + echo root > cpuset.cpus.partition + console_msg "Running state transition test ..." + + while [[ $I -lt $CNT ]] + do + echo "Running test $I ..." > /dev/console + eval set -- "\${$TEST[$I]}" + ROOT=$1 + OLD_A1=$2 + OLD_A2=$3 + OLD_A3=$4 + OLD_B1=$5 + NEW_A1=$6 + NEW_A2=$7 + NEW_A3=$8 + NEW_B1=$9 + RESULT=${10} + ECPUS=${11} + STATES=${12} + + set_ctrl_state_noerr . $ROOT + set_ctrl_state_noerr A1 $OLD_A1 + set_ctrl_state_noerr A1/A2 $OLD_A2 + set_ctrl_state_noerr A1/A2/A3 $OLD_A3 + set_ctrl_state_noerr B1 $OLD_B1 + RETVAL=0 + set_ctrl_state A1 $NEW_A1; ((RETVAL += $?)) + set_ctrl_state A1/A2 $NEW_A2; ((RETVAL += $?)) + set_ctrl_state A1/A2/A3 $NEW_A3; ((RETVAL += $?)) + set_ctrl_state B1 $NEW_B1; ((RETVAL += $?)) + + [[ $RETVAL -ne $RESULT ]] && { + echo "Test $TEST[$I] failed result check!" + eval echo \"\${$TEST[$I]}\" + dump_states + online_cpus + exit 1 + } + + [[ -n "$ECPUS" && "$ECPUS" != . ]] && { + check_effective_cpus $ECPUS + [[ $? -ne 0 ]] && { + echo "Test $TEST[$I] failed effective CPU check!" + eval echo \"\${$TEST[$I]}\" + echo + dump_states + online_cpus + exit 1 + } + } + + [[ -n "$STATES" ]] && { + check_cgroup_states $STATES + [[ $? -ne 0 ]] && { + echo "FAILED: Test $TEST[$I] failed states check!" + eval echo \"\${$TEST[$I]}\" + echo + dump_states + online_cpus + exit 1 + } + } + + reset_cgroup_states + # + # Check to see if effective cpu list changes + # + pause 0.05 + NEWLIST=$(cat cpuset.cpus.effective) + [[ $NEWLIST != $CPULIST ]] && { + echo "Effective cpus changed to $NEWLIST after test $I!" + exit 1 + } + [[ -n "$VERBOSE" ]] && echo "Test $I done." + ((I++)) + done + echo "All $I tests of $TEST PASSED." 
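+ # + # Worked example of the row encoding: the matrix entry " S+ C0-3:P1:S+ C2-3:P1 . . O2-0 . . . 0 A1:0-1,A2:3" means: enable cpuset in the test root (S+); give A1 cpus 0-3, make it a partition root and enable cpuset in its subtree_control (C0-3:P1:S+); give A2 cpus 2-3 and make it a partition root (C2-3:P1); then offline cpu 2 (O2-0). No write is expected to fail (0) and the effective cpus should end up as 0-1 in A1 and 3 in A2. Roughly the equivalent raw commands, run from the test root: + # echo +cpuset > cgroup.subtree_control + # echo 0-3 > A1/cpuset.cpus; echo root > A1/cpuset.cpus.partition + # echo +cpuset > A1/cgroup.subtree_control + # echo 2-3 > A1/A2/cpuset.cpus; echo root > A1/A2/cpuset.cpus.partition + # echo 0 > /sys/devices/system/cpu/cpu2/online + #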
+ + echo member > cpuset.cpus.partition +} + +# +# Wait for an inotify event on the given file and read it +# $1: cgroup file to wait for +# $2: file to store the read result +# +wait_inotify() +{ + CGROUP_FILE=$1 + OUTPUT_FILE=$2 + + $WAIT_INOTIFY $CGROUP_FILE + cat $CGROUP_FILE > $OUTPUT_FILE +} + +# +# Test if inotify events are properly generated when going into and out of +# invalid partition state. +# +test_inotify() +{ + ERR=0 + PRS=/tmp/.prs_$$ + [[ -f $WAIT_INOTIFY ]] || { + echo "wait_inotify not found, inotify test SKIPPED." + return + } + + pause 0.01 + echo 1 > cpuset.cpus + echo 0 > cgroup.procs + echo root > cpuset.cpus.partition + pause 0.01 + rm -f $PRS + wait_inotify $PWD/cpuset.cpus.partition $PRS & + pause 0.01 + set_ctrl_state . "O1-0" + pause 0.01 + check_cgroup_states ".:P-1" + if [[ $? -ne 0 ]] + then + echo "FAILED: Inotify test - partition not invalid" + ERR=1 + elif [[ ! -f $PRS ]] + then + echo "FAILED: Inotify test - event not generated" + ERR=1 + kill %1 + elif [[ $(cat $PRS) != "root invalid"* ]] + then + echo "FAILED: Inotify test - incorrect state" + cat $PRS + ERR=1 + fi + online_cpus + echo member > cpuset.cpus.partition + echo 0 > ../cgroup.procs + if [[ $ERR -ne 0 ]] + then + exit 1 + else + echo "Inotify test PASSED" + fi +} + +run_state_test TEST_MATRIX +test_isolated +test_inotify +echo "All tests PASSED." +cd .. +rmdir test diff --git a/tools/testing/selftests/cgroup/wait_inotify.c b/tools/testing/selftests/cgroup/wait_inotify.c new file mode 100644 index 000000000000..e11b431e1b62 --- /dev/null +++ b/tools/testing/selftests/cgroup/wait_inotify.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Wait for an inotify event on the given cgroup file. + */ +#include <linux/limits.h> +#include <sys/inotify.h> +#include <sys/mman.h> +#include <sys/ptrace.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <errno.h> +#include <fcntl.h> +#include <poll.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +static const char usage[] = "Usage: %s [-v] <cgroup_file>\n"; +static char *file; +static int verbose; + +static inline void fail_message(char *msg) +{ + fprintf(stderr, msg, file); + exit(1); +} + +int main(int argc, char *argv[]) +{ + char *cmd = argv[0]; + int c, fd; + struct pollfd fds = { .events = POLLIN, }; + + while ((c = getopt(argc, argv, "v")) != -1) { + switch (c) { + case 'v': + verbose++; + break; + } + argv++, argc--; + } + + if (argc != 2) { + fprintf(stderr, usage, cmd); + return -1; + } + file = argv[1]; + fd = open(file, O_RDONLY); + if (fd < 0) + fail_message("Cgroup file %s not found!\n"); + close(fd); + + fd = inotify_init(); + if (fd < 0) + fail_message("inotify_init() fails on %s!\n"); + if (inotify_add_watch(fd, file, IN_MODIFY) < 0) + fail_message("inotify_add_watch() fails on %s!\n"); + fds.fd = fd; + + /* + * poll waiting loop + */ + for (;;) { + int ret = poll(&fds, 1, 10000); + + if (ret < 0) { + if (errno == EINTR) + continue; + perror("poll"); + exit(1); + } + if ((ret > 0) && (fds.revents & POLLIN)) + break; + } + if (verbose) { + struct inotify_event events[10]; + long len; + + usleep(1000); + len = read(fd, events, sizeof(events)); + printf("Number of events read = %ld\n", + len/sizeof(struct inotify_event)); + } + close(fd); + return 0; +} diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 0470c5f3e690..a1fa2eff8192 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -6,6
+6,7 @@ TEST_GEN_FILES += huge_count_read_write TEST_FILES = _chk_dependency.sh _debugfs_common.sh TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh +TEST_PROGS += debugfs_duplicate_context_creation.sh TEST_PROGS += sysfs.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh b/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh new file mode 100644 index 000000000000..4a76e37ef16b --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source _debugfs_common.sh + +# Test duplicated context creation +# ================================ + +if ! echo foo > "$DBGFS/mk_contexts" +then + echo "context creation failed" + exit 1 +fi + +if echo foo > "$DBGFS/mk_contexts" +then + echo "duplicate context creation success" + exit 1 +fi + +if ! echo foo > "$DBGFS/rm_contexts" +then + echo "context deletion failed" + exit 1 +fi + +exit 0 diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc new file mode 100644 index 000000000000..fc1daac7f066 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc @@ -0,0 +1,27 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Event probe event parser error log check +# requires: dynamic_events events/syscalls/sys_enter_openat "<attached-group>.<attached-event> [<args>]":README error_log + +check_error() { # command-with-error-pos-by-^ + ftrace_errlog_check 'event_probe' "$1" 'dynamic_events' +} + +check_error 'e ^a.' # NO_EVENT_INFO +check_error 'e ^.b' # NO_EVENT_INFO +check_error 'e ^a.b' # BAD_ATTACH_EVENT +check_error 'e syscalls/sys_enter_openat ^foo' # BAD_ATTACH_ARG +check_error 'e:^/bar syscalls/sys_enter_openat' # NO_GROUP_NAME +check_error 'e:^12345678901234567890123456789012345678901234567890123456789012345/bar syscalls/sys_enter_openat' # GROUP_TOO_LONG + +check_error 'e:^foo.1/bar syscalls/sys_enter_openat' # BAD_GROUP_NAME +check_error 'e:^ syscalls/sys_enter_openat' # NO_EVENT_NAME +check_error 'e:foo/^12345678901234567890123456789012345678901234567890123456789012345 syscalls/sys_enter_openat' # EVENT_TOO_LONG +check_error 'e:foo/^bar.1 syscalls/sys_enter_openat' # BAD_EVENT_NAME + +check_error 'e:foo/bar syscalls/sys_enter_openat arg=^dfd' # BAD_FETCH_ARG +check_error 'e:foo/bar syscalls/sys_enter_openat ^arg=$foo' # BAD_ATTACH_ARG + +check_error 'e:foo/bar syscalls/sys_enter_openat if ^' # NO_EP_FILTER + +exit 0 diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc index 3145b0f1835c..8d26d5505808 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc @@ -85,7 +85,7 @@ run_enable_disable() { echo $check_disable > $EVENT_ENABLE done sleep $SLEEP_TIME - echo " make sure it's still works" + echo " make sure it still works" test_event_enabled $check_enable_star reset_ftrace_filter diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c index f8c43ce8fe66..c6b8f32990c8 100644 --- 
a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c @@ -184,7 +184,7 @@ int main(int argc, char *argv[]) /* * If res is non-zero, we either requeued the waiter or hit an * error, break out and handle it. If it is zero, then the - * signal may have hit before the the waiter was blocked on f1. + * signal may have hit before the waiter was blocked on f1. * Try again. */ if (res > 0) { diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index d625a3f83780..2f0d705db9db 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only +/aarch64/aarch32_id_regs /aarch64/arch_timer /aarch64/debug-exceptions /aarch64/get-reg-list @@ -28,6 +29,7 @@ /x86_64/max_vcpuid_cap_test /x86_64/mmio_warning_test /x86_64/monitor_mwait_test +/x86_64/nested_exceptions_test /x86_64/nx_huge_pages_test /x86_64/platform_info_test /x86_64/pmu_event_filter_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 6448cb9f710f..0172eb6cb6ee 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -91,6 +91,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/kvm_clock_test TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test TEST_GEN_PROGS_x86_64 += x86_64/monitor_mwait_test +TEST_GEN_PROGS_x86_64 += x86_64/nested_exceptions_test TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id @@ -146,6 +147,7 @@ TEST_GEN_PROGS_x86_64 += system_counter_offset_test # Compiled outputs used by test targets TEST_GEN_PROGS_EXTENDED_x86_64 += x86_64/nx_huge_pages_test +TEST_GEN_PROGS_aarch64 += aarch64/aarch32_id_regs TEST_GEN_PROGS_aarch64 += aarch64/arch_timer TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list diff --git a/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c new file mode 100644 index 000000000000..6f9c1f19c7f6 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * aarch32_id_regs - Test for ID register behavior on AArch64-only systems + * + * Copyright (c) 2022 Google LLC. + * + * Test that KVM handles the AArch64 views of the AArch32 ID registers as RAZ + * and WI from userspace. 
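+ * + * The checks also cover the two unallocated encodings in the AArch32 ID + * register block, sys_reg(3, 0, 0, 3, 3) and sys_reg(3, 0, 0, 3, 7), which + * are likewise expected to read as zero.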
+ */ + +#include <stdint.h> + +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + +#define BAD_ID_REG_VAL 0x1badc0deul + +#define GUEST_ASSERT_REG_RAZ(reg) GUEST_ASSERT_EQ(read_sysreg_s(reg), 0) + +static void guest_main(void) +{ + GUEST_ASSERT_REG_RAZ(SYS_ID_PFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_PFR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_DFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_AFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR2_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR3_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR2_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR3_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR4_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR5_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR4_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR6_EL1); + GUEST_ASSERT_REG_RAZ(SYS_MVFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_MVFR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_MVFR2_EL1); + GUEST_ASSERT_REG_RAZ(sys_reg(3, 0, 0, 3, 3)); + GUEST_ASSERT_REG_RAZ(SYS_ID_PFR2_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_DFR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR5_EL1); + GUEST_ASSERT_REG_RAZ(sys_reg(3, 0, 0, 3, 7)); + + GUEST_DONE(); +} + +static void test_guest_raz(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + } +} + +static uint64_t raz_wi_reg_ids[] = { + KVM_ARM64_SYS_REG(SYS_ID_PFR0_EL1), + KVM_ARM64_SYS_REG(SYS_ID_PFR1_EL1), + KVM_ARM64_SYS_REG(SYS_ID_DFR0_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR0_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR1_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR2_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR3_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR0_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR1_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR2_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR3_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR4_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR5_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR4_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR6_EL1), + KVM_ARM64_SYS_REG(SYS_MVFR0_EL1), + KVM_ARM64_SYS_REG(SYS_MVFR1_EL1), + KVM_ARM64_SYS_REG(SYS_MVFR2_EL1), + KVM_ARM64_SYS_REG(SYS_ID_PFR2_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR5_EL1), +}; + +static void test_user_raz_wi(struct kvm_vcpu *vcpu) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(raz_wi_reg_ids); i++) { + uint64_t reg_id = raz_wi_reg_ids[i]; + uint64_t val; + + vcpu_get_reg(vcpu, reg_id, &val); + ASSERT_EQ(val, 0); + + /* + * Expect the ioctl to succeed with no effect on the register + * value. 
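+ * This is the "WI" (write-ignored) half of the contract: KVM_SET_ONE_REG + * accepts the bogus value, and the read-back below confirms it was + * discarded.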
+ */ + vcpu_set_reg(vcpu, reg_id, BAD_ID_REG_VAL); + + vcpu_get_reg(vcpu, reg_id, &val); + ASSERT_EQ(val, 0); + } +} + +static uint64_t raz_invariant_reg_ids[] = { + KVM_ARM64_SYS_REG(SYS_ID_AFR0_EL1), + KVM_ARM64_SYS_REG(sys_reg(3, 0, 0, 3, 3)), + KVM_ARM64_SYS_REG(SYS_ID_DFR1_EL1), + KVM_ARM64_SYS_REG(sys_reg(3, 0, 0, 3, 7)), +}; + +static void test_user_raz_invariant(struct kvm_vcpu *vcpu) +{ + int i, r; + + for (i = 0; i < ARRAY_SIZE(raz_invariant_reg_ids); i++) { + uint64_t reg_id = raz_invariant_reg_ids[i]; + uint64_t val; + + vcpu_get_reg(vcpu, reg_id, &val); + ASSERT_EQ(val, 0); + + r = __vcpu_set_reg(vcpu, reg_id, BAD_ID_REG_VAL); + TEST_ASSERT(r < 0 && errno == EINVAL, + "unexpected KVM_SET_ONE_REG error: r=%d, errno=%d", r, errno); + + vcpu_get_reg(vcpu, reg_id, &val); + ASSERT_EQ(val, 0); + } +} + + + +static bool vcpu_aarch64_only(struct kvm_vcpu *vcpu) +{ + uint64_t val, el0; + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), &val); + + el0 = (val & ARM64_FEATURE_MASK(ID_AA64PFR0_EL0)) >> ID_AA64PFR0_EL0_SHIFT; + return el0 == ID_AA64PFR0_ELx_64BIT_ONLY; +} + +int main(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + + TEST_REQUIRE(vcpu_aarch64_only(vcpu)); + + ucall_init(vm, NULL); + + test_user_raz_wi(vcpu); + test_user_raz_invariant(vcpu); + test_guest_raz(vcpu); + + ucall_uninit(vm); + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index 2ee35cf9801e..947bd201435c 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -22,6 +22,7 @@ #define SPSR_SS (1 << 21) extern unsigned char sw_bp, sw_bp2, hw_bp, hw_bp2, bp_svc, bp_brk, hw_wp, ss_start; +extern unsigned char iter_ss_begin, iter_ss_end; static volatile uint64_t sw_bp_addr, hw_bp_addr; static volatile uint64_t wp_addr, wp_data_addr; static volatile uint64_t svc_addr; @@ -238,6 +239,46 @@ static void guest_svc_handler(struct ex_regs *regs) { svc_addr = regs->pc; } +enum single_step_op { + SINGLE_STEP_ENABLE = 0, + SINGLE_STEP_DISABLE = 1, +}; + +static void guest_code_ss(int test_cnt) +{ + uint64_t i; + uint64_t bvr, wvr, w_bvr, w_wvr; + + for (i = 0; i < test_cnt; i++) { + /* Bits [1:0] of dbg{b,w}vr are RES0 */ + w_bvr = i << 2; + w_wvr = i << 2; + + /* Enable Single Step execution */ + GUEST_SYNC(SINGLE_STEP_ENABLE); + + /* + * The userspace will verify that the pc is as expected during + * single step execution between iter_ss_begin and iter_ss_end.
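+ * The two labels are plain asm labels defined below; the host side sees + * them through the "extern unsigned char iter_ss_begin, iter_ss_end" + * declarations at the top of this file.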
+ */ + asm volatile("iter_ss_begin:nop\n"); + + write_sysreg(w_bvr, dbgbvr0_el1); + write_sysreg(w_wvr, dbgwvr0_el1); + bvr = read_sysreg(dbgbvr0_el1); + wvr = read_sysreg(dbgwvr0_el1); + + asm volatile("iter_ss_end:\n"); + + /* Disable Single Step execution */ + GUEST_SYNC(SINGLE_STEP_DISABLE); + + GUEST_ASSERT(bvr == w_bvr); + GUEST_ASSERT(wvr == w_wvr); + } + GUEST_DONE(); +} + static int debug_version(struct kvm_vcpu *vcpu) { uint64_t id_aa64dfr0; @@ -246,7 +287,7 @@ static int debug_version(struct kvm_vcpu *vcpu) return id_aa64dfr0 & 0xf; } -int main(int argc, char *argv[]) +static void test_guest_debug_exceptions(void) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; @@ -259,9 +300,6 @@ int main(int argc, char *argv[]) vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vcpu); - __TEST_REQUIRE(debug_version(vcpu) >= 6, - "Armv8 debug architecture not supported."); - vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, ESR_EC_BRK_INS, guest_sw_bp_handler); vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, @@ -294,5 +332,108 @@ done: kvm_vm_free(vm); +} + +void test_single_step_from_userspace(int test_cnt) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + struct kvm_run *run; + uint64_t pc, cmd; + uint64_t test_pc = 0; + bool ss_enable = false; + struct kvm_guest_debug debug = {}; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code_ss); + ucall_init(vm, NULL); + run = vcpu->run; + vcpu_args_set(vcpu, 1, test_cnt); + + while (1) { + vcpu_run(vcpu); + if (run->exit_reason != KVM_EXIT_DEBUG) { + cmd = get_ucall(vcpu, &uc); + if (cmd == UCALL_ABORT) { + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + } else if (cmd == UCALL_DONE) { + break; + } + + TEST_ASSERT(cmd == UCALL_SYNC, + "Unexpected ucall cmd 0x%lx", cmd); + + if (uc.args[1] == SINGLE_STEP_ENABLE) { + debug.control = KVM_GUESTDBG_ENABLE | + KVM_GUESTDBG_SINGLESTEP; + ss_enable = true; + } else { + debug.control = SINGLE_STEP_DISABLE; + ss_enable = false; + } + + vcpu_guest_debug_set(vcpu, &debug); + continue; + } + + TEST_ASSERT(ss_enable, "Unexpected KVM_EXIT_DEBUG"); + + /* Check if the current pc is expected. */ + vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc), &pc); + TEST_ASSERT(!test_pc || pc == test_pc, + "Unexpected pc 0x%lx (expected 0x%lx)", + pc, test_pc); + + /* + * If the current pc is between iter_ss_begin and + * iter_ss_end, the pc for the next KVM_EXIT_DEBUG should + * be the current pc + 4.
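+ * (A64 instructions are a fixed 4 bytes, so each single step through + * that window should advance the pc by exactly 4.)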
+ */ + if ((pc >= (uint64_t)&iter_ss_begin) && + (pc < (uint64_t)&iter_ss_end)) + test_pc = pc + 4; + else + test_pc = 0; + } + + kvm_vm_free(vm); +} + +static void help(char *name) +{ + puts(""); + printf("Usage: %s [-h] [-i iterations of the single step test]\n", name); + puts(""); + exit(0); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int opt; + int ss_iteration = 10000; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + __TEST_REQUIRE(debug_version(vcpu) >= 6, + "Armv8 debug architecture not supported."); + kvm_vm_free(vm); + + while ((opt = getopt(argc, argv, "i:")) != -1) { + switch (opt) { + case 'i': + ss_iteration = atoi(optarg); + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + test_guest_debug_exceptions(); + test_single_step_from_userspace(ss_iteration); + return 0; } diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index f7621f6e938e..e0b9e81a3e09 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -1,12 +1,14 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * psci_cpu_on_test - Test that the observable state of a vCPU targeted by the - * CPU_ON PSCI call matches what the caller requested. + * psci_test - Tests relating to KVM's PSCI implementation. * * Copyright (c) 2021 Google LLC. * - * This is a regression test for a race between KVM servicing the PSCI call and - * userspace reading the vCPUs registers. + * This test includes: + * - A regression test for a race between KVM servicing the PSCI CPU_ON call + * and userspace reading the targeted vCPU's registers. + * - A test for KVM's handling of PSCI SYSTEM_SUSPEND and the associated + * KVM_SYSTEM_EVENT_SUSPEND UAPI. 
*/ #define _GNU_SOURCE diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 9c883c94d478..b5234d6efbe1 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -17,6 +17,7 @@ #include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/atomic.h> +#include <asm/barrier.h> #include "kvm_util.h" #include "test_util.h" @@ -264,7 +265,8 @@ static void default_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) static bool dirty_ring_supported(void) { - return kvm_has_cap(KVM_CAP_DIRTY_LOG_RING); + return (kvm_has_cap(KVM_CAP_DIRTY_LOG_RING) || + kvm_has_cap(KVM_CAP_DIRTY_LOG_RING_ACQ_REL)); } static void dirty_ring_create_vm_done(struct kvm_vm *vm) @@ -279,12 +281,12 @@ static void dirty_ring_create_vm_done(struct kvm_vm *vm) static inline bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn) { - return gfn->flags == KVM_DIRTY_GFN_F_DIRTY; + return smp_load_acquire(&gfn->flags) == KVM_DIRTY_GFN_F_DIRTY; } static inline void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn) { - gfn->flags = KVM_DIRTY_GFN_F_RESET; + smp_store_release(&gfn->flags, KVM_DIRTY_GFN_F_RESET); } static uint32_t dirty_ring_collect_one(struct kvm_dirty_gfn *dirty_gfns, diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 24fde97f6121..e42a09cd24a0 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -175,6 +175,10 @@ extern const struct vm_guest_mode_params vm_guest_mode_params[]; int open_path_or_exit(const char *path, int flags); int open_kvm_dev_path_or_exit(void); + +bool get_kvm_intel_param_bool(const char *param); +bool get_kvm_amd_param_bool(const char *param); + unsigned int kvm_check_cap(long cap); static inline bool kvm_has_cap(long cap) diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 5c5a88180b6c..befc754ce9b3 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -63,8 +63,10 @@ void test_assert(bool exp, const char *exp_str, #a, #b, #a, (unsigned long) __a, #b, (unsigned long) __b); \ } while (0) -#define TEST_FAIL(fmt, ...) \ - TEST_ASSERT(false, fmt, ##__VA_ARGS__) +#define TEST_FAIL(fmt, ...) 
do { \ + TEST_ASSERT(false, fmt, ##__VA_ARGS__); \ + __builtin_unreachable(); \ +} while (0) size_t parse_size(const char *size); diff --git a/tools/testing/selftests/kvm/include/x86_64/evmcs.h b/tools/testing/selftests/kvm/include/x86_64/evmcs.h index 3c9260f8e116..58db74f68af2 100644 --- a/tools/testing/selftests/kvm/include/x86_64/evmcs.h +++ b/tools/testing/selftests/kvm/include/x86_64/evmcs.h @@ -203,14 +203,25 @@ struct hv_enlightened_vmcs { u32 reserved:30; } hv_enlightenments_control; u32 hv_vp_id; - + u32 padding32_2; u64 hv_vm_id; u64 partition_assist_page; u64 padding64_4[4]; u64 guest_bndcfgs; - u64 padding64_5[7]; + u64 guest_ia32_perf_global_ctrl; + u64 guest_ia32_s_cet; + u64 guest_ssp; + u64 guest_ia32_int_ssp_table_addr; + u64 guest_ia32_lbr_ctl; + u64 padding64_5[2]; u64 xss_exit_bitmap; - u64 padding64_6[7]; + u64 encls_exiting_bitmap; + u64 host_ia32_perf_global_ctrl; + u64 tsc_multiplier; + u64 host_ia32_s_cet; + u64 host_ssp; + u64 host_ia32_int_ssp_table_addr; + u64 padding64_6; }; #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0 @@ -656,6 +667,18 @@ static inline int evmcs_vmread(uint64_t encoding, uint64_t *value) case VIRTUAL_PROCESSOR_ID: *value = current_evmcs->virtual_processor_id; break; + case HOST_IA32_PERF_GLOBAL_CTRL: + *value = current_evmcs->host_ia32_perf_global_ctrl; + break; + case GUEST_IA32_PERF_GLOBAL_CTRL: + *value = current_evmcs->guest_ia32_perf_global_ctrl; + break; + case ENCLS_EXITING_BITMAP: + *value = current_evmcs->encls_exiting_bitmap; + break; + case TSC_MULTIPLIER: + *value = current_evmcs->tsc_multiplier; + break; default: return 1; } @@ -1169,6 +1192,22 @@ static inline int evmcs_vmwrite(uint64_t encoding, uint64_t value) current_evmcs->virtual_processor_id = value; current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT; break; + case HOST_IA32_PERF_GLOBAL_CTRL: + current_evmcs->host_ia32_perf_global_ctrl = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case GUEST_IA32_PERF_GLOBAL_CTRL: + current_evmcs->guest_ia32_perf_global_ctrl = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case ENCLS_EXITING_BITMAP: + current_evmcs->encls_exiting_bitmap = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; + break; + case TSC_MULTIPLIER: + current_evmcs->tsc_multiplier = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; + break; default: return 1; } diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 0cbc71b7af50..e8ca0d8a6a7e 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -825,6 +825,8 @@ static inline uint8_t wrmsr_safe(uint32_t msr, uint64_t val) return kvm_asm_safe("wrmsr", "a"(val & -1u), "d"(val >> 32), "c"(msr)); } +bool kvm_is_tdp_enabled(void); + uint64_t vm_get_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, uint64_t vaddr); void vm_set_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, @@ -855,6 +857,8 @@ enum pg_level { #define PG_SIZE_1G PG_LEVEL_SIZE(PG_LEVEL_1G) void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level); +void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + uint64_t nr_bytes, int level); /* * Basic CPU control in CR0 diff --git a/tools/testing/selftests/kvm/include/x86_64/svm_util.h 
b/tools/testing/selftests/kvm/include/x86_64/svm_util.h index a339b537a575..7aee6244ab6a 100644 --- a/tools/testing/selftests/kvm/include/x86_64/svm_util.h +++ b/tools/testing/selftests/kvm/include/x86_64/svm_util.h @@ -9,15 +9,12 @@ #ifndef SELFTEST_KVM_SVM_UTILS_H #define SELFTEST_KVM_SVM_UTILS_H +#include <asm/svm.h> + #include <stdint.h> #include "svm.h" #include "processor.h" -#define SVM_EXIT_EXCP_BASE 0x040 -#define SVM_EXIT_HLT 0x078 -#define SVM_EXIT_MSR 0x07c -#define SVM_EXIT_VMMCALL 0x081 - struct svm_test_data { /* VMCB */ struct vmcb *vmcb; /* gva */ diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index 790c6d1ecb34..71b290b6469d 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -8,6 +8,8 @@ #ifndef SELFTEST_KVM_VMX_H #define SELFTEST_KVM_VMX_H +#include <asm/vmx.h> + #include <stdint.h> #include "processor.h" #include "apic.h" @@ -100,55 +102,6 @@ #define VMX_EPT_VPID_CAP_AD_BITS 0x00200000 #define EXIT_REASON_FAILED_VMENTRY 0x80000000 -#define EXIT_REASON_EXCEPTION_NMI 0 -#define EXIT_REASON_EXTERNAL_INTERRUPT 1 -#define EXIT_REASON_TRIPLE_FAULT 2 -#define EXIT_REASON_INTERRUPT_WINDOW 7 -#define EXIT_REASON_NMI_WINDOW 8 -#define EXIT_REASON_TASK_SWITCH 9 -#define EXIT_REASON_CPUID 10 -#define EXIT_REASON_HLT 12 -#define EXIT_REASON_INVD 13 -#define EXIT_REASON_INVLPG 14 -#define EXIT_REASON_RDPMC 15 -#define EXIT_REASON_RDTSC 16 -#define EXIT_REASON_VMCALL 18 -#define EXIT_REASON_VMCLEAR 19 -#define EXIT_REASON_VMLAUNCH 20 -#define EXIT_REASON_VMPTRLD 21 -#define EXIT_REASON_VMPTRST 22 -#define EXIT_REASON_VMREAD 23 -#define EXIT_REASON_VMRESUME 24 -#define EXIT_REASON_VMWRITE 25 -#define EXIT_REASON_VMOFF 26 -#define EXIT_REASON_VMON 27 -#define EXIT_REASON_CR_ACCESS 28 -#define EXIT_REASON_DR_ACCESS 29 -#define EXIT_REASON_IO_INSTRUCTION 30 -#define EXIT_REASON_MSR_READ 31 -#define EXIT_REASON_MSR_WRITE 32 -#define EXIT_REASON_INVALID_STATE 33 -#define EXIT_REASON_MWAIT_INSTRUCTION 36 -#define EXIT_REASON_MONITOR_INSTRUCTION 39 -#define EXIT_REASON_PAUSE_INSTRUCTION 40 -#define EXIT_REASON_MCE_DURING_VMENTRY 41 -#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 -#define EXIT_REASON_APIC_ACCESS 44 -#define EXIT_REASON_EOI_INDUCED 45 -#define EXIT_REASON_EPT_VIOLATION 48 -#define EXIT_REASON_EPT_MISCONFIG 49 -#define EXIT_REASON_INVEPT 50 -#define EXIT_REASON_RDTSCP 51 -#define EXIT_REASON_PREEMPTION_TIMER 52 -#define EXIT_REASON_INVVPID 53 -#define EXIT_REASON_WBINVD 54 -#define EXIT_REASON_XSETBV 55 -#define EXIT_REASON_APIC_WRITE 56 -#define EXIT_REASON_INVPCID 58 -#define EXIT_REASON_PML_FULL 62 -#define EXIT_REASON_XSAVES 63 -#define EXIT_REASON_XRSTORS 64 -#define LAST_EXIT_REASON 64 enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, @@ -208,6 +161,8 @@ enum vmcs_field { VMWRITE_BITMAP_HIGH = 0x00002029, XSS_EXIT_BITMAP = 0x0000202C, XSS_EXIT_BITMAP_HIGH = 0x0000202D, + ENCLS_EXITING_BITMAP = 0x0000202E, + ENCLS_EXITING_BITMAP_HIGH = 0x0000202F, TSC_MULTIPLIER = 0x00002032, TSC_MULTIPLIER_HIGH = 0x00002033, GUEST_PHYSICAL_ADDRESS = 0x00002400, diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 9889fe0d8919..f1cb1627161f 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -50,6 +50,45 @@ int open_kvm_dev_path_or_exit(void) return _open_kvm_dev_path_or_exit(O_RDONLY); } +static bool get_module_param_bool(const char *module_name, 
const char *param) +{ + const int path_size = 128; + char path[path_size]; + char value; + ssize_t r; + int fd; + + r = snprintf(path, path_size, "/sys/module/%s/parameters/%s", + module_name, param); + TEST_ASSERT(r < path_size, + "Failed to construct sysfs path in %d bytes.", path_size); + + fd = open_path_or_exit(path, O_RDONLY); + + r = read(fd, &value, 1); + TEST_ASSERT(r == 1, "read(%s) failed", path); + + r = close(fd); + TEST_ASSERT(!r, "close(%s) failed", path); + + if (value == 'Y') + return true; + else if (value == 'N') + return false; + + TEST_FAIL("Unrecognized value '%c' for boolean module param", value); +} + +bool get_kvm_intel_param_bool(const char *param) +{ + return get_module_param_bool("kvm_intel", param); +} + +bool get_kvm_amd_param_bool(const char *param) +{ + return get_module_param_bool("kvm_amd", param); +} + /* * Capability * @@ -82,7 +121,10 @@ unsigned int kvm_check_cap(long cap) void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size) { - vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING, ring_size); + if (vm_check_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL)) + vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL, ring_size); + else + vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING, ring_size); vm->dirty_ring_size = ring_size; } diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 2e6e61bbe81b..39c4409ef56a 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -111,6 +111,14 @@ static void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent) } } +bool kvm_is_tdp_enabled(void) +{ + if (is_intel_cpu()) + return get_kvm_intel_param_bool("ept"); + else + return get_kvm_amd_param_bool("npt"); +} + void virt_arch_pgd_alloc(struct kvm_vm *vm) { TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " @@ -214,6 +222,25 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) __virt_pg_map(vm, vaddr, paddr, PG_LEVEL_4K); } +void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + uint64_t nr_bytes, int level) +{ + uint64_t pg_size = PG_LEVEL_SIZE(level); + uint64_t nr_pages = nr_bytes / pg_size; + int i; + + TEST_ASSERT(nr_bytes % pg_size == 0, + "Region size not aligned: nr_bytes: 0x%lx, page size: 0x%lx", + nr_bytes, pg_size); + + for (i = 0; i < nr_pages; i++) { + __virt_pg_map(vm, vaddr, paddr, level); + + vaddr += pg_size; + paddr += pg_size; + } +} + static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, uint64_t vaddr) @@ -1294,20 +1321,9 @@ done: /* Returns true if kvm_intel was loaded with unrestricted_guest=1. */ bool vm_is_unrestricted_guest(struct kvm_vm *vm) { - char val = 'N'; - size_t count; - FILE *f; - /* Ensure that a KVM vendor-specific module is loaded. 
*/ if (vm == NULL) close(open_kvm_dev_path_or_exit()); - f = fopen("/sys/module/kvm_intel/parameters/unrestricted_guest", "r"); - if (f) { - count = fread(&val, sizeof(char), 1, f); - TEST_ASSERT(count == 1, "Unable to read from param file."); - fclose(f); - } - - return val == 'Y'; + return get_kvm_intel_param_bool("unrestricted_guest"); } diff --git a/tools/testing/selftests/kvm/lib/x86_64/svm.c b/tools/testing/selftests/kvm/lib/x86_64/svm.c index 6d445886e16c..5495a92dfd5a 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/svm.c +++ b/tools/testing/selftests/kvm/lib/x86_64/svm.c @@ -60,18 +60,6 @@ static void vmcb_set_seg(struct vmcb_seg *seg, u16 selector, seg->base = base; } -/* - * Avoid using memset to clear the vmcb, since libc may not be - * available in L1 (and, even if it is, features that libc memset may - * want to use, like AVX, may not be enabled). - */ -static void clear_vmcb(struct vmcb *vmcb) -{ - int n = sizeof(*vmcb) / sizeof(u32); - - asm volatile ("rep stosl" : "+c"(n), "+D"(vmcb) : "a"(0) : "memory"); -} - void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp) { struct vmcb *vmcb = svm->vmcb; @@ -88,7 +76,7 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r wrmsr(MSR_EFER, efer | EFER_SVME); wrmsr(MSR_VM_HSAVE_PA, svm->save_area_gpa); - clear_vmcb(vmcb); + memset(vmcb, 0, sizeof(*vmcb)); asm volatile ("vmsave %0\n\t" : : "a" (vmcb_gpa) : "memory"); vmcb_set_seg(&save->es, get_es(), 0, -1U, data_seg_attr); vmcb_set_seg(&save->cs, get_cs(), 0, -1U, code_seg_attr); diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c index e0004bd26536..32f7e09ef67c 100644 --- a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c +++ b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c @@ -17,84 +17,70 @@ /* VMCALL and VMMCALL are both 3-byte opcodes. 
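* (VMCALL is 0f 01 c1 and VMMCALL is 0f 01 d9, matching the vmx_vmcall and * svm_vmmcall byte arrays added below.)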
*/ #define HYPERCALL_INSN_SIZE 3 -static bool ud_expected; +static bool quirk_disabled; static void guest_ud_handler(struct ex_regs *regs) { - GUEST_ASSERT(ud_expected); - GUEST_DONE(); + regs->rax = -EFAULT; + regs->rip += HYPERCALL_INSN_SIZE; } -extern uint8_t svm_hypercall_insn[HYPERCALL_INSN_SIZE]; -static uint64_t svm_do_sched_yield(uint8_t apic_id) -{ - uint64_t ret; +static const uint8_t vmx_vmcall[HYPERCALL_INSN_SIZE] = { 0x0f, 0x01, 0xc1 }; +static const uint8_t svm_vmmcall[HYPERCALL_INSN_SIZE] = { 0x0f, 0x01, 0xd9 }; - asm volatile("mov %1, %%rax\n\t" - "mov %2, %%rbx\n\t" - "svm_hypercall_insn:\n\t" - "vmmcall\n\t" - "mov %%rax, %0\n\t" - : "=r"(ret) - : "r"((uint64_t)KVM_HC_SCHED_YIELD), "r"((uint64_t)apic_id) - : "rax", "rbx", "memory"); - - return ret; -} - -extern uint8_t vmx_hypercall_insn[HYPERCALL_INSN_SIZE]; -static uint64_t vmx_do_sched_yield(uint8_t apic_id) +extern uint8_t hypercall_insn[HYPERCALL_INSN_SIZE]; +static uint64_t do_sched_yield(uint8_t apic_id) { uint64_t ret; - asm volatile("mov %1, %%rax\n\t" - "mov %2, %%rbx\n\t" - "vmx_hypercall_insn:\n\t" - "vmcall\n\t" - "mov %%rax, %0\n\t" - : "=r"(ret) - : "r"((uint64_t)KVM_HC_SCHED_YIELD), "r"((uint64_t)apic_id) - : "rax", "rbx", "memory"); + asm volatile("hypercall_insn:\n\t" + ".byte 0xcc,0xcc,0xcc\n\t" + : "=a"(ret) + : "a"((uint64_t)KVM_HC_SCHED_YIELD), "b"((uint64_t)apic_id) + : "memory"); return ret; } static void guest_main(void) { - uint8_t *native_hypercall_insn, *hypercall_insn; - uint8_t apic_id; - - apic_id = GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID)); + const uint8_t *native_hypercall_insn; + const uint8_t *other_hypercall_insn; + uint64_t ret; if (is_intel_cpu()) { - native_hypercall_insn = vmx_hypercall_insn; - hypercall_insn = svm_hypercall_insn; - svm_do_sched_yield(apic_id); + native_hypercall_insn = vmx_vmcall; + other_hypercall_insn = svm_vmmcall; } else if (is_amd_cpu()) { - native_hypercall_insn = svm_hypercall_insn; - hypercall_insn = vmx_hypercall_insn; - vmx_do_sched_yield(apic_id); + native_hypercall_insn = svm_vmmcall; + other_hypercall_insn = vmx_vmcall; } else { GUEST_ASSERT(0); /* unreachable */ return; } + memcpy(hypercall_insn, other_hypercall_insn, HYPERCALL_INSN_SIZE); + + ret = do_sched_yield(GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID))); + /* - * The hypercall didn't #UD (guest_ud_handler() signals "done" if a #UD - * occurs). Verify that a #UD is NOT expected and that KVM patched in - * the native hypercall. + * If the quirk is disabled, verify that guest_ud_handler() "returned" + * -EFAULT and that KVM did NOT patch the hypercall. If the quirk is + * enabled, verify that the hypercall succeeded and that KVM patched in + * the "right" hypercall. 
*/ - GUEST_ASSERT(!ud_expected); - GUEST_ASSERT(!memcmp(native_hypercall_insn, hypercall_insn, HYPERCALL_INSN_SIZE)); - GUEST_DONE(); -} + if (quirk_disabled) { + GUEST_ASSERT(ret == (uint64_t)-EFAULT); + GUEST_ASSERT(!memcmp(other_hypercall_insn, hypercall_insn, + HYPERCALL_INSN_SIZE)); + } else { + GUEST_ASSERT(!ret); + GUEST_ASSERT(!memcmp(native_hypercall_insn, hypercall_insn, + HYPERCALL_INSN_SIZE)); + } -static void setup_ud_vector(struct kvm_vcpu *vcpu) -{ - vm_init_descriptor_tables(vcpu->vm); - vcpu_init_descriptor_tables(vcpu); - vm_install_exception_handler(vcpu->vm, UD_VECTOR, guest_ud_handler); + GUEST_DONE(); } static void enter_guest(struct kvm_vcpu *vcpu) @@ -117,35 +103,23 @@ static void enter_guest(struct kvm_vcpu *vcpu) } } -static void test_fix_hypercall(void) +static void test_fix_hypercall(bool disable_quirk) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; vm = vm_create_with_one_vcpu(&vcpu, guest_main); - setup_ud_vector(vcpu); - - ud_expected = false; - sync_global_to_guest(vm, ud_expected); - - virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); - - enter_guest(vcpu); -} -static void test_fix_hypercall_disabled(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - vm = vm_create_with_one_vcpu(&vcpu, guest_main); - setup_ud_vector(vcpu); + vm_init_descriptor_tables(vcpu->vm); + vcpu_init_descriptor_tables(vcpu); + vm_install_exception_handler(vcpu->vm, UD_VECTOR, guest_ud_handler); - vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, - KVM_X86_QUIRK_FIX_HYPERCALL_INSN); + if (disable_quirk) + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, + KVM_X86_QUIRK_FIX_HYPERCALL_INSN); - ud_expected = true; - sync_global_to_guest(vm, ud_expected); + quirk_disabled = disable_quirk; + sync_global_to_guest(vm, quirk_disabled); virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); @@ -156,6 +130,6 @@ int main(void) { TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_FIX_HYPERCALL_INSN); - test_fix_hypercall(); - test_fix_hypercall_disabled(); + test_fix_hypercall(false); + test_fix_hypercall(true); } diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c index 79ab0152d281..05b32e550a80 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -26,7 +26,8 @@ static inline uint8_t hypercall(u64 control, vm_vaddr_t input_address, : "=a" (*hv_status), "+c" (control), "+d" (input_address), KVM_ASM_SAFE_OUTPUTS(vector) - : [output_address] "r"(output_address) + : [output_address] "r"(output_address), + "a" (-EFAULT) : "cc", "memory", "r8", KVM_ASM_SAFE_CLOBBERS); return vector; } @@ -81,13 +82,13 @@ static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall) } vector = hypercall(hcall->control, input, output, &res); - if (hcall->ud_expected) + if (hcall->ud_expected) { GUEST_ASSERT_2(vector == UD_VECTOR, hcall->control, vector); - else + } else { GUEST_ASSERT_2(!vector, hcall->control, vector); + GUEST_ASSERT_2(res == hcall->expect, hcall->expect, res); + } - GUEST_ASSERT_2(!hcall->ud_expected || res == hcall->expect, - hcall->expect, res); GUEST_DONE(); } @@ -507,7 +508,7 @@ static void guest_test_hcalls_access(void) switch (stage) { case 0: feat->eax |= HV_MSR_HYPERCALL_AVAILABLE; - hcall->control = 0xdeadbeef; + hcall->control = 0xbeef; hcall->expect = HV_STATUS_INVALID_HYPERCALL_CODE; break; diff --git a/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c b/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c new 
file mode 100644 index 000000000000..ac33835f78f4 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c @@ -0,0 +1,295 @@ +// SPDX-License-Identifier: GPL-2.0-only +#define _GNU_SOURCE /* for program_invocation_short_name */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" +#include "svm_util.h" + +#define L2_GUEST_STACK_SIZE 256 + +/* + * Arbitrary, never shoved into KVM/hardware, just need to avoid conflict with + * the "real" exceptions used, #SS/#GP/#DF (12/13/8). + */ +#define FAKE_TRIPLE_FAULT_VECTOR 0xaa + +/* Arbitrary 32-bit error code injected by this test. */ +#define SS_ERROR_CODE 0xdeadbeef + +/* + * Bit '0' is set on Intel if the exception occurs while delivering a previous + * event/exception. AMD's wording is ambiguous, but presumably the bit is set + * if the exception occurs while delivering an external event, e.g. NMI or INTR, + * but not for exceptions that occur when delivering other exceptions or + * software interrupts. + * + * Note, Intel's name for it, "External event", is misleading and much more + * aligned with AMD's behavior, but the SDM is quite clear on its behavior. + */ +#define ERROR_CODE_EXT_FLAG BIT(0) + +/* + * Bit '1' is set if the fault occurred when looking up a descriptor in the + * IDT, which is the case here as the IDT is empty/NULL. + */ +#define ERROR_CODE_IDT_FLAG BIT(1) + +/* + * The #GP that occurs when vectoring #SS should show the index into the IDT + * for #SS, plus have the "IDT flag" set. + */ +#define GP_ERROR_CODE_AMD ((SS_VECTOR * 8) | ERROR_CODE_IDT_FLAG) +#define GP_ERROR_CODE_INTEL ((SS_VECTOR * 8) | ERROR_CODE_IDT_FLAG | ERROR_CODE_EXT_FLAG) + +/* + * Intel and AMD both shove '0' into the error code on #DF, regardless of what + * led to the double fault. 
+ */ +#define DF_ERROR_CODE 0 + +#define INTERCEPT_SS (BIT_ULL(SS_VECTOR)) +#define INTERCEPT_SS_DF (INTERCEPT_SS | BIT_ULL(DF_VECTOR)) +#define INTERCEPT_SS_GP_DF (INTERCEPT_SS_DF | BIT_ULL(GP_VECTOR)) + +static void l2_ss_pending_test(void) +{ + GUEST_SYNC(SS_VECTOR); +} + +static void l2_ss_injected_gp_test(void) +{ + GUEST_SYNC(GP_VECTOR); +} + +static void l2_ss_injected_df_test(void) +{ + GUEST_SYNC(DF_VECTOR); +} + +static void l2_ss_injected_tf_test(void) +{ + GUEST_SYNC(FAKE_TRIPLE_FAULT_VECTOR); +} + +static void svm_run_l2(struct svm_test_data *svm, void *l2_code, int vector, + uint32_t error_code) +{ + struct vmcb *vmcb = svm->vmcb; + struct vmcb_control_area *ctrl = &vmcb->control; + + vmcb->save.rip = (u64)l2_code; + run_guest(vmcb, svm->vmcb_gpa); + + if (vector == FAKE_TRIPLE_FAULT_VECTOR) + return; + + GUEST_ASSERT_EQ(ctrl->exit_code, (SVM_EXIT_EXCP_BASE + vector)); + GUEST_ASSERT_EQ(ctrl->exit_info_1, error_code); +} + +static void l1_svm_code(struct svm_test_data *svm) +{ + struct vmcb_control_area *ctrl = &svm->vmcb->control; + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + generic_svm_setup(svm, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + svm->vmcb->save.idtr.limit = 0; + ctrl->intercept |= BIT_ULL(INTERCEPT_SHUTDOWN); + + ctrl->intercept_exceptions = INTERCEPT_SS_GP_DF; + svm_run_l2(svm, l2_ss_pending_test, SS_VECTOR, SS_ERROR_CODE); + svm_run_l2(svm, l2_ss_injected_gp_test, GP_VECTOR, GP_ERROR_CODE_AMD); + + ctrl->intercept_exceptions = INTERCEPT_SS_DF; + svm_run_l2(svm, l2_ss_injected_df_test, DF_VECTOR, DF_ERROR_CODE); + + ctrl->intercept_exceptions = INTERCEPT_SS; + svm_run_l2(svm, l2_ss_injected_tf_test, FAKE_TRIPLE_FAULT_VECTOR, 0); + GUEST_ASSERT_EQ(ctrl->exit_code, SVM_EXIT_SHUTDOWN); + + GUEST_DONE(); +} + +static void vmx_run_l2(void *l2_code, int vector, uint32_t error_code) +{ + GUEST_ASSERT(!vmwrite(GUEST_RIP, (u64)l2_code)); + + GUEST_ASSERT_EQ(vector == SS_VECTOR ? vmlaunch() : vmresume(), 0); + + if (vector == FAKE_TRIPLE_FAULT_VECTOR) + return; + + GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_EXCEPTION_NMI); + GUEST_ASSERT_EQ((vmreadz(VM_EXIT_INTR_INFO) & 0xff), vector); + GUEST_ASSERT_EQ(vmreadz(VM_EXIT_INTR_ERROR_CODE), error_code); +} + +static void l1_vmx_code(struct vmx_pages *vmx) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + GUEST_ASSERT_EQ(prepare_for_vmx_operation(vmx), true); + + GUEST_ASSERT_EQ(load_vmcs(vmx), true); + + prepare_vmcs(vmx, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + GUEST_ASSERT_EQ(vmwrite(GUEST_IDTR_LIMIT, 0), 0); + + /* + * VMX disallows injecting an exception with error_code[31:16] != 0, + * and hardware will never generate a VM-Exit with bits 31:16 set. + * KVM should likewise truncate the "bad" userspace value. 
+ */ + GUEST_ASSERT_EQ(vmwrite(EXCEPTION_BITMAP, INTERCEPT_SS_GP_DF), 0); + vmx_run_l2(l2_ss_pending_test, SS_VECTOR, (u16)SS_ERROR_CODE); + vmx_run_l2(l2_ss_injected_gp_test, GP_VECTOR, GP_ERROR_CODE_INTEL); + + GUEST_ASSERT_EQ(vmwrite(EXCEPTION_BITMAP, INTERCEPT_SS_DF), 0); + vmx_run_l2(l2_ss_injected_df_test, DF_VECTOR, DF_ERROR_CODE); + + GUEST_ASSERT_EQ(vmwrite(EXCEPTION_BITMAP, INTERCEPT_SS), 0); + vmx_run_l2(l2_ss_injected_tf_test, FAKE_TRIPLE_FAULT_VECTOR, 0); + GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_TRIPLE_FAULT); + + GUEST_DONE(); +} + +static void __attribute__((__flatten__)) l1_guest_code(void *test_data) +{ + if (this_cpu_has(X86_FEATURE_SVM)) + l1_svm_code(test_data); + else + l1_vmx_code(test_data); +} + +static void assert_ucall_vector(struct kvm_vcpu *vcpu, int vector) +{ + struct kvm_run *run = vcpu->run; + struct ucall uc; + + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Unexpected exit reason: %u (%s),\n", + run->exit_reason, exit_reason_str(run->exit_reason)); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + TEST_ASSERT(vector == uc.args[1], + "Expected L2 to ask for %d, got %ld", vector, uc.args[1]); + break; + case UCALL_DONE: + TEST_ASSERT(vector == -1, + "Expected L2 to ask for %d, L2 says it's done", vector); + break; + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld (0x%lx != 0x%lx)", + (const char *)uc.args[0], __FILE__, uc.args[1], + uc.args[2], uc.args[3]); + break; + default: + TEST_FAIL("Expected L2 to ask for %d, got unexpected ucall %lu", vector, uc.cmd); + } +} + +static void queue_ss_exception(struct kvm_vcpu *vcpu, bool inject) +{ + struct kvm_vcpu_events events; + + vcpu_events_get(vcpu, &events); + + TEST_ASSERT(!events.exception.pending, + "Vector %d unexpectedly pending", events.exception.nr); + TEST_ASSERT(!events.exception.injected, + "Vector %d unexpectedly injected", events.exception.nr); + + events.flags = KVM_VCPUEVENT_VALID_PAYLOAD; + events.exception.pending = !inject; + events.exception.injected = inject; + events.exception.nr = SS_VECTOR; + events.exception.has_error_code = true; + events.exception.error_code = SS_ERROR_CODE; + vcpu_events_set(vcpu, &events); +} + +/* + * Verify KVM_{G,S}ET_EVENTS play nice with pending vs. injected exceptions + * when an exception is being queued for L2. Specifically, verify that KVM + * honors L1 exception intercept controls when a #SS is pending/injected, + * triggers a #GP on vectoring the #SS, morphs to #DF if #GP isn't intercepted + * by L1, and finally causes (nested) SHUTDOWN if #DF isn't intercepted by L1. + */ +int main(int argc, char *argv[]) +{ + vm_vaddr_t nested_test_data_gva; + struct kvm_vcpu_events events; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_EXCEPTION_PAYLOAD)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM) || kvm_cpu_has(X86_FEATURE_VMX)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + vm_enable_cap(vm, KVM_CAP_EXCEPTION_PAYLOAD, -2ul); + + if (kvm_cpu_has(X86_FEATURE_SVM)) + vcpu_alloc_svm(vm, &nested_test_data_gva); + else + vcpu_alloc_vmx(vm, &nested_test_data_gva); + + vcpu_args_set(vcpu, 1, nested_test_data_gva); + + /* Run L1 => L2. L2 should sync and request #SS. */ + vcpu_run(vcpu); + assert_ucall_vector(vcpu, SS_VECTOR); + + /* Pend #SS and request immediate exit. #SS should still be pending. */ + queue_ss_exception(vcpu, false); + vcpu->run->immediate_exit = true; + vcpu_run_complete_io(vcpu); + + /* Verify the pending event comes back out the same as it went in. */ + vcpu_events_get(vcpu, &events); + ASSERT_EQ(events.flags & KVM_VCPUEVENT_VALID_PAYLOAD, + KVM_VCPUEVENT_VALID_PAYLOAD); + ASSERT_EQ(events.exception.pending, true); + ASSERT_EQ(events.exception.nr, SS_VECTOR); + ASSERT_EQ(events.exception.has_error_code, true); + ASSERT_EQ(events.exception.error_code, SS_ERROR_CODE); + + /* + * Run for real with the pending #SS, L1 should get a VM-Exit due to + * #SS interception and re-enter L2 to request #GP (via injected #SS). + */ + vcpu->run->immediate_exit = false; + vcpu_run(vcpu); + assert_ucall_vector(vcpu, GP_VECTOR); + + /* + * Inject #SS, the #SS should bypass interception and cause #GP, which + * L1 should intercept before KVM morphs it to #DF. L1 should then + * disable #GP interception and run L2 to request #DF (via #SS => #GP). + */ + queue_ss_exception(vcpu, true); + vcpu_run(vcpu); + assert_ucall_vector(vcpu, DF_VECTOR); + + /* + * Inject #SS, the #SS should bypass interception and cause #GP, which + * L1 is no longer intercepting, and so should see a #DF VM-Exit. L1 + * should then signal that it is done. + */ + queue_ss_exception(vcpu, true); + vcpu_run(vcpu); + assert_ucall_vector(vcpu, FAKE_TRIPLE_FAULT_VECTOR); + + /* + * Inject #SS yet again. L1 is not intercepting #GP or #DF, and so + * should see nested TRIPLE_FAULT / SHUTDOWN. + */ + queue_ss_exception(vcpu, true); + vcpu_run(vcpu); + assert_ucall_vector(vcpu, -1); + + kvm_vm_free(vm); +}
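The GP_ERROR_CODE_AMD/GP_ERROR_CODE_INTEL constants checked by this test follow the architectural layout of the error code pushed for a fault during IDT vectoring; as a standalone sketch of that encoding (illustration only, not part of the patch):

        #include <stdbool.h>
        #include <stdint.h>

        /*
         * Error code for a fault hit while looking up an IDT descriptor:
         * bits 15:3 carry the vector (an index into the IDT), bit 1 is the
         * "IDT" flag, and bit 0 is the "EXT" flag that Intel sets when the
         * fault occurs while delivering a prior event.
         */
        static uint32_t idt_fault_error_code(uint8_t vector, bool ext)
        {
                uint32_t ec = (uint32_t)vector * 8;     /* IDT index */

                ec |= 1u << 1;                          /* IDT flag */
                if (ext)
                        ec |= 1u << 0;                  /* EXT flag */
                return ec;
        }

        /*
         * For the #SS vector (12) used by the test:
         * idt_fault_error_code(12, false) == GP_ERROR_CODE_AMD and
         * idt_fault_error_code(12, true)  == GP_ERROR_CODE_INTEL.
         */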
diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c index cc6421716400..59ffe7fd354f 100644 --- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c +++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c @@ -112,19 +112,13 @@ void run_test(int reclaim_period_ms, bool disable_nx_huge_pages, { struct kvm_vcpu *vcpu; struct kvm_vm *vm; + uint64_t nr_bytes; void *hva; int r; vm = vm_create(1); if (disable_nx_huge_pages) { - /* - * Cannot run the test without NX huge pages if the kernel - * does not support it. - */ - if (!kvm_check_cap(KVM_CAP_VM_DISABLE_NX_HUGE_PAGES)) - return; - r = __vm_disable_nx_huge_pages(vm); if (reboot_permissions) { TEST_ASSERT(!r, "Disabling NX huge pages should succeed if process has reboot permissions"); @@ -141,10 +135,24 @@ void run_test(int reclaim_period_ms, bool disable_nx_huge_pages, HPAGE_GPA, HPAGE_SLOT, HPAGE_SLOT_NPAGES, 0); - virt_map(vm, HPAGE_GVA, HPAGE_GPA, HPAGE_SLOT_NPAGES); + nr_bytes = HPAGE_SLOT_NPAGES * vm->page_size; + + /* + * Ensure that KVM can map HPAGE_SLOT with huge pages by mapping the + * region into the guest with 2MiB pages whenever TDP is disabled (i.e. + * whenever KVM is shadowing the guest page tables). + * + * When TDP is enabled, KVM should be able to map HPAGE_SLOT with huge + * pages irrespective of the guest page size, so map with 4KiB pages + * to test that that is the case. + */ + if (kvm_is_tdp_enabled()) + virt_map_level(vm, HPAGE_GVA, HPAGE_GPA, nr_bytes, PG_LEVEL_4K); + else + virt_map_level(vm, HPAGE_GVA, HPAGE_GPA, nr_bytes, PG_LEVEL_2M); hva = addr_gpa2hva(vm, HPAGE_GPA); - memset(hva, RETURN_OPCODE, HPAGE_SLOT_NPAGES * PAGE_SIZE); + memset(hva, RETURN_OPCODE, nr_bytes); check_2m_page_count(vm, 0); check_split_count(vm, 0); @@ -248,18 +256,13 @@ int main(int argc, char **argv) { } } - if (token != MAGIC_TOKEN) { - print_skip("This test must be run with the magic token %d.\n" - "This is done by nx_huge_pages_test.sh, which\n" - "also handles environment setup for the test.", - MAGIC_TOKEN); - exit(KSFT_SKIP); - } + TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_DISABLE_NX_HUGE_PAGES)); + TEST_REQUIRE(reclaim_period_ms > 0); - if (!reclaim_period_ms) { - print_skip("The NX reclaim period must be specified and non-zero"); - exit(KSFT_SKIP); - } + __TEST_REQUIRE(token == MAGIC_TOKEN, + "This test must be run with the magic token %d.\n" + "This is done by nx_huge_pages_test.sh, which\n" + "also handles environment setup for the test.", MAGIC_TOKEN); run_test(reclaim_period_ms, false, reboot_permissions); run_test(reclaim_period_ms, true, reboot_permissions);
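kvm_is_tdp_enabled() steers the virt_map_level() choice above by asking whether EPT/NPT is in use; a rough, standalone approximation for illustration (assumed /sys/module paths, not the selftest library's exact helper, error handling elided):

        #include <stdio.h>

        /* ept (Intel) and npt (AMD) are boolean module params, 'Y'/'N' or '1'/'0'. */
        static int tdp_enabled_guess(void)
        {
                const char *params[] = {
                        "/sys/module/kvm_intel/parameters/ept",
                        "/sys/module/kvm_amd/parameters/npt",
                };
                for (int i = 0; i < 2; i++) {
                        FILE *f = fopen(params[i], "r");
                        int c;

                        if (!f)
                                continue;       /* other vendor's module */
                        c = fgetc(f);
                        fclose(f);
                        return c == 'Y' || c == 'y' || c == '1';
                }
                return -1;      /* KVM not loaded; caller must decide */
        }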
diff --git a/tools/testing/selftests/livepatch/Makefile b/tools/testing/selftests/livepatch/Makefile index 1acc9e1fa3fb..02fadc9d55e0 100644 --- a/tools/testing/selftests/livepatch/Makefile +++ b/tools/testing/selftests/livepatch/Makefile @@ -6,7 +6,8 @@ TEST_PROGS := \ test-callbacks.sh \ test-shadow-vars.sh \ test-state.sh \ - test-ftrace.sh + test-ftrace.sh \ + test-sysfs.sh TEST_FILES := settings diff --git a/tools/testing/selftests/livepatch/functions.sh b/tools/testing/selftests/livepatch/functions.sh index 9230b869371d..c8416c54b463 100644 --- a/tools/testing/selftests/livepatch/functions.sh +++ b/tools/testing/selftests/livepatch/functions.sh @@ -6,6 +6,7 @@ MAX_RETRIES=600 RETRY_INTERVAL=".1" # seconds +KLP_SYSFS_DIR="/sys/kernel/livepatch" # Kselftest framework requirement - SKIP code is 4 ksft_skip=4 @@ -86,7 +87,7 @@ function set_ftrace_enabled() { if [[ "$result" != "$1" ]] ; then if [[ $can_fail -eq 1 ]] ; then - echo "livepatch: $err" > /dev/kmsg + echo "livepatch: $err" | sed 's#/proc/sys/kernel/#kernel.#' > /dev/kmsg return fi @@ -308,3 +309,36 @@ function check_result { cleanup_dmesg_file } + +# check_sysfs_rights(modname, rel_path, expected_rights) - check sysfs +# path permissions +# modname - livepatch module creating the sysfs interface +# rel_path - relative path of the sysfs interface +# expected_rights - expected access rights +function check_sysfs_rights() { + local mod="$1"; shift + local rel_path="$1"; shift + local expected_rights="$1"; shift + + local path="$KLP_SYSFS_DIR/$mod/$rel_path" + local rights=$(/bin/stat --format '%A' "$path") + if test "$rights" != "$expected_rights" ; then + die "Unexpected access rights of $path: $expected_rights vs. $rights" + fi +} + +# check_sysfs_value(modname, rel_path, expected_value) - check sysfs value +# modname - livepatch module creating the sysfs interface +# rel_path - relative path of the sysfs interface +# expected_value - expected value read from the file +function check_sysfs_value() { + local mod="$1"; shift + local rel_path="$1"; shift + local expected_value="$1"; shift + + local path="$KLP_SYSFS_DIR/$mod/$rel_path" + local value=`cat $path` + if test "$value" != "$expected_value" ; then + die "Unexpected value in $path: $expected_value vs. $value" + fi +}
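check_sysfs_rights() compares against the symbolic permission string printed by stat --format '%A'; for illustration, the same string can be rebuilt from st_mode in C (hypothetical helper, not used by the suite; setuid/setgid/sticky bits omitted):

        #include <stdio.h>
        #include <sys/stat.h>

        /* Rebuild the "drwxr-xr-x" style string that stat --format '%A' prints. */
        static void mode_string(mode_t m, char out[11])
        {
                static const mode_t bits[9] = {
                        S_IRUSR, S_IWUSR, S_IXUSR,
                        S_IRGRP, S_IWGRP, S_IXGRP,
                        S_IROTH, S_IWOTH, S_IXOTH,
                };
                static const char chars[] = "rwxrwxrwx";
                int i;

                out[0] = S_ISDIR(m) ? 'd' : '-';
                for (i = 0; i < 9; i++)
                        out[i + 1] = (m & bits[i]) ? chars[i] : '-';
                out[10] = '\0';
        }

        int main(void)
        {
                struct stat st;
                char buf[11];

                /* /sys/kernel/livepatch exists only with CONFIG_LIVEPATCH. */
                if (stat("/sys/kernel/livepatch", &st) == 0) {
                        mode_string(st.st_mode, buf);
                        printf("%s\n", buf);    /* e.g. "drwxr-xr-x" */
                }
                return 0;
        }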
diff --git a/tools/testing/selftests/livepatch/test-sysfs.sh b/tools/testing/selftests/livepatch/test-sysfs.sh new file mode 100755 index 000000000000..7f76f280189a --- /dev/null +++ b/tools/testing/selftests/livepatch/test-sysfs.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2022 Song Liu <song@kernel.org> + +. $(dirname $0)/functions.sh + +MOD_LIVEPATCH=test_klp_livepatch + +setup_config + +# - load a livepatch and verify that the sysfs entries work as expected + +start_test "sysfs test" + +load_lp $MOD_LIVEPATCH + +check_sysfs_rights "$MOD_LIVEPATCH" "" "drwxr-xr-x" +check_sysfs_rights "$MOD_LIVEPATCH" "enabled" "-rw-r--r--" +check_sysfs_value "$MOD_LIVEPATCH" "enabled" "1" +check_sysfs_rights "$MOD_LIVEPATCH" "force" "--w-------" +check_sysfs_rights "$MOD_LIVEPATCH" "transition" "-r--r--r--" +check_sysfs_value "$MOD_LIVEPATCH" "transition" "0" +check_sysfs_rights "$MOD_LIVEPATCH" "vmlinux/patched" "-r--r--r--" +check_sysfs_value "$MOD_LIVEPATCH" "vmlinux/patched" "1" + +disable_lp $MOD_LIVEPATCH + +unload_lp $MOD_LIVEPATCH + +check_result "% modprobe $MOD_LIVEPATCH +livepatch: enabling patch '$MOD_LIVEPATCH' +livepatch: '$MOD_LIVEPATCH': initializing patching transition +livepatch: '$MOD_LIVEPATCH': starting patching transition +livepatch: '$MOD_LIVEPATCH': completing patching transition +livepatch: '$MOD_LIVEPATCH': patching complete +% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled +livepatch: '$MOD_LIVEPATCH': initializing unpatching transition +livepatch: '$MOD_LIVEPATCH': starting unpatching transition +livepatch: '$MOD_LIVEPATCH': completing unpatching transition +livepatch: '$MOD_LIVEPATCH': unpatching complete +% rmmod $MOD_LIVEPATCH" + +start_test "sysfs test object/patched" + +MOD_LIVEPATCH=test_klp_callbacks_demo +MOD_TARGET=test_klp_callbacks_mod +load_lp $MOD_LIVEPATCH + +# check that the "patched" file changes as the target module loads/unloads +check_sysfs_value "$MOD_LIVEPATCH" "$MOD_TARGET/patched" "0" +load_mod $MOD_TARGET +check_sysfs_value "$MOD_LIVEPATCH" "$MOD_TARGET/patched" "1" +unload_mod $MOD_TARGET +check_sysfs_value "$MOD_LIVEPATCH" "$MOD_TARGET/patched" "0" + +disable_lp $MOD_LIVEPATCH +unload_lp $MOD_LIVEPATCH + +check_result "% modprobe test_klp_callbacks_demo +livepatch: enabling patch 'test_klp_callbacks_demo' +livepatch: 'test_klp_callbacks_demo': initializing patching transition +test_klp_callbacks_demo: pre_patch_callback: vmlinux +livepatch: 'test_klp_callbacks_demo': starting patching transition +livepatch: 'test_klp_callbacks_demo': completing patching transition +test_klp_callbacks_demo: post_patch_callback: vmlinux +livepatch: 'test_klp_callbacks_demo': patching complete +% modprobe test_klp_callbacks_mod +livepatch: applying patch 'test_klp_callbacks_demo' to loading module 'test_klp_callbacks_mod' +test_klp_callbacks_demo: pre_patch_callback: test_klp_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init +test_klp_callbacks_demo: post_patch_callback: test_klp_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init +test_klp_callbacks_mod: test_klp_callbacks_mod_init +% rmmod test_klp_callbacks_mod +test_klp_callbacks_mod: test_klp_callbacks_mod_exit +test_klp_callbacks_demo: pre_unpatch_callback: test_klp_callbacks_mod -> [MODULE_STATE_GOING] Going away +livepatch: reverting patch 'test_klp_callbacks_demo' on unloading module 'test_klp_callbacks_mod' +test_klp_callbacks_demo: post_unpatch_callback: test_klp_callbacks_mod -> [MODULE_STATE_GOING] Going away +% echo 
0 > /sys/kernel/livepatch/test_klp_callbacks_demo/enabled +livepatch: 'test_klp_callbacks_demo': initializing unpatching transition +test_klp_callbacks_demo: pre_unpatch_callback: vmlinux +livepatch: 'test_klp_callbacks_demo': starting unpatching transition +livepatch: 'test_klp_callbacks_demo': completing unpatching transition +test_klp_callbacks_demo: post_unpatch_callback: vmlinux +livepatch: 'test_klp_callbacks_demo': unpatching complete +% rmmod test_klp_callbacks_demo" + +exit 0 diff --git a/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh b/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh index 46a97f318f58..74ee5067a8ce 100755 --- a/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh +++ b/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh @@ -134,6 +134,16 @@ offline_memory_expect_fail() return 0 } +online_all_offline_memory() +{ + for memory in `hotpluggable_offline_memory`; do + if ! online_memory_expect_success $memory; then + echo "$FUNCNAME $memory: unexpected fail" >&2 + retval=1 + fi + done +} + error=-12 priority=0 # Run with default of ratio=2 for Kselftest run @@ -197,8 +207,11 @@ echo -e "\t trying to offline $target out of $hotpluggable_num memory block(s):" for memory in `hotpluggable_online_memory`; do if [ "$target" -gt 0 ]; then echo "online->offline memory$memory" - if offline_memory_expect_success $memory; then + if offline_memory_expect_success $memory &>/dev/null; then target=$(($target - 1)) + echo "-> Success" + else + echo "-> Failure" fi fi done @@ -257,7 +270,7 @@ prerequisite_extra echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_OFFLINE/error for memory in `hotpluggable_online_memory`; do if [ $((RANDOM % 100)) -lt $ratio ]; then - offline_memory_expect_success $memory + offline_memory_expect_success $memory &>/dev/null fi done @@ -266,16 +279,16 @@ done # echo $error > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_ONLINE/error for memory in `hotpluggable_offline_memory`; do - online_memory_expect_fail $memory + if ! online_memory_expect_fail $memory; then + retval=1 + fi done # # Online all hot-pluggable memory # echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_ONLINE/error -for memory in `hotpluggable_offline_memory`; do - online_memory_expect_success $memory -done +online_all_offline_memory # # Test memory hot-remove error handling (online => offline) @@ -283,11 +296,18 @@ done echo $error > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_OFFLINE/error for memory in `hotpluggable_online_memory`; do if [ $((RANDOM % 100)) -lt $ratio ]; then - offline_memory_expect_fail $memory + if ! 
offline_memory_expect_fail $memory; then + retval=1 + fi fi done echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_OFFLINE/error /sbin/modprobe -q -r memory-notifier-error-inject +# +# Restore memory before exit +# +online_all_offline_memory + exit $retval diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 2a6b0bc648c4..69c58362c0ed 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -70,6 +70,7 @@ TEST_PROGS += io_uring_zerocopy_tx.sh TEST_GEN_FILES += bind_bhash TEST_GEN_PROGS += sk_bind_sendto_listen TEST_GEN_PROGS += sk_connect_zero_addr +TEST_PROGS += test_ingress_egress_chaining.sh TEST_FILES := settings diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index d5a0dd548989..ee5e98204d3d 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -1223,6 +1223,11 @@ ipv4_fcnal() log_test $rc 0 "Delete nexthop route warning" run_cmd "$IP route delete 172.16.101.1/32 nhid 12" run_cmd "$IP nexthop del id 12" + + run_cmd "$IP nexthop add id 21 via 172.16.1.6 dev veth1" + run_cmd "$IP ro add 172.16.101.0/24 nhid 21" + run_cmd "$IP ro del 172.16.101.0/24 nexthop via 172.16.1.7 dev veth1 nexthop via 172.16.1.8 dev veth1" + log_test $? 2 "Delete multipath route with only nh id based entry" } ipv4_grp_fcnal() diff --git a/tools/testing/selftests/net/test_ingress_egress_chaining.sh b/tools/testing/selftests/net/test_ingress_egress_chaining.sh new file mode 100644 index 000000000000..08adff6bb3b6 --- /dev/null +++ b/tools/testing/selftests/net/test_ingress_egress_chaining.sh @@ -0,0 +1,79 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This test runs a simple ingress tc setup between two veth pairs, +# and chains a single egress rule to test ingress chaining to egress. +# +# Kselftest framework requirement - SKIP code is 4. 
+ksft_skip=4 + +if [ "$(id -u)" -ne 0 ]; then + echo "SKIP: Need root privileges" + exit $ksft_skip +fi + +needed_mods="act_mirred cls_flower sch_ingress" +for mod in $needed_mods; do + modinfo $mod &>/dev/null || { echo "SKIP: Need $mod module"; exit $ksft_skip; } +done + +ns="ns$((RANDOM%899+100))" +veth1="veth1$((RANDOM%899+100))" +veth2="veth2$((RANDOM%899+100))" +peer1="peer1$((RANDOM%899+100))" +peer2="peer2$((RANDOM%899+100))" +ip_peer1=198.51.100.5 +ip_peer2=198.51.100.6 + +function fail() { + echo "FAIL: $@" >> /dev/stderr + exit 1 +} + +function cleanup() { + killall -q -9 udpgso_bench_rx + ip link del $veth1 &> /dev/null + ip link del $veth2 &> /dev/null + ip netns del $ns &> /dev/null +} +trap cleanup EXIT + +function config() { + echo "Setup veth pair [$veth1, $peer1] and veth pair [$veth2, $peer2]" + ip link add $veth1 type veth peer name $peer1 + ip link add $veth2 type veth peer name $peer2 + ip addr add $ip_peer1/24 dev $peer1 + ip link set $peer1 up + ip netns add $ns + ip link set dev $peer2 netns $ns + ip netns exec $ns ip addr add $ip_peer2/24 dev $peer2 + ip netns exec $ns ip link set $peer2 up + ip link set $veth1 up + ip link set $veth2 up + + echo "Add tc filter ingress->egress forwarding $veth1 <-> $veth2" + tc qdisc add dev $veth2 ingress + tc qdisc add dev $veth1 ingress + tc filter add dev $veth2 ingress prio 1 proto all flower \ action mirred egress redirect dev $veth1 + tc filter add dev $veth1 ingress prio 1 proto all flower \ action mirred egress redirect dev $veth2 + + echo "Add tc filter egress->ingress forwarding $peer1 -> $veth1, bypassing the veth pipe" + tc qdisc add dev $peer1 clsact + tc filter add dev $peer1 egress prio 20 proto ip flower \ action mirred ingress redirect dev $veth1 +} + +function test_run() { + echo "Run TCP traffic" + ./udpgso_bench_rx -t & + sleep 1 + ip netns exec $ns timeout -k 2 10 ./udpgso_bench_tx -t -l 2 -4 -D $ip_peer1 || fail "traffic failed" + echo "Test passed" +} + +config +test_run +trap - EXIT +cleanup
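test_run drives TCP through the chained redirects with the net selftests' udpgso_bench_rx/_tx helpers, where -t selects TCP mode. Purely for illustration, a bare-bones TCP sink in the same spirit (hypothetical stand-in with a fixed port, most error handling elided):

        #include <arpa/inet.h>
        #include <stdio.h>
        #include <sys/socket.h>
        #include <unistd.h>

        int main(void)
        {
                char buf[4096];
                int srv = socket(AF_INET, SOCK_STREAM, 0);
                struct sockaddr_in addr = {
                        .sin_family = AF_INET,
                        .sin_port = htons(8000),        /* assumed port */
                        .sin_addr.s_addr = htonl(INADDR_ANY),
                };
                int one = 1, conn;
                long total = 0;
                ssize_t n;

                setsockopt(srv, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
                bind(srv, (struct sockaddr *)&addr, sizeof(addr));
                listen(srv, 1);
                conn = accept(srv, NULL, NULL); /* the peer netns connects here */
                while ((n = read(conn, buf, sizeof(buf))) > 0)
                        total += n;             /* traffic made it through tc */
                printf("received %ld bytes\n", total);
                close(conn);
                close(srv);
                return 0;
        }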
diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile index 600e3a19d5e2..4504ee07be08 100644 --- a/tools/testing/selftests/netfilter/Makefile +++ b/tools/testing/selftests/netfilter/Makefile @@ -6,7 +6,7 @@ TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \ nft_concat_range.sh nft_conntrack_helper.sh \ nft_queue.sh nft_meta.sh nf_nat_edemux.sh \ ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh \ - conntrack_vrf.sh nft_synproxy.sh + conntrack_vrf.sh nft_synproxy.sh rpath.sh CFLAGS += $(shell pkg-config --cflags libmnl 2>/dev/null || echo "-I/usr/include/libmnl") LDLIBS = -lmnl diff --git a/tools/testing/selftests/netfilter/nft_fib.sh b/tools/testing/selftests/netfilter/nft_fib.sh index fd76b69635a4..dff476e45e77 100755 --- a/tools/testing/selftests/netfilter/nft_fib.sh +++ b/tools/testing/selftests/netfilter/nft_fib.sh @@ -188,6 +188,7 @@ test_ping() { ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null +ip netns exec ${nsrouter} sysctl net.ipv4.conf.all.rp_filter=0 > /dev/null ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.rp_filter=0 > /dev/null sleep 3 diff --git a/tools/testing/selftests/netfilter/rpath.sh b/tools/testing/selftests/netfilter/rpath.sh new file mode 100755 index 000000000000..2d8da7bd8ab7 --- /dev/null +++ b/tools/testing/selftests/netfilter/rpath.sh @@ -0,0 +1,147 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# return code to signal skipped test +ksft_skip=4 + +# search for legacy iptables (it uses the xtables extensions) +if iptables-legacy --version >/dev/null 2>&1; then + iptables='iptables-legacy' +elif iptables --version >/dev/null 2>&1; then + iptables='iptables' +else + iptables='' +fi + +if ip6tables-legacy --version >/dev/null 2>&1; then + ip6tables='ip6tables-legacy' +elif ip6tables --version >/dev/null 2>&1; then + ip6tables='ip6tables' +else + ip6tables='' +fi + +if nft --version >/dev/null 2>&1; then + nft='nft' +else + nft='' +fi + +if [ -z "$iptables$ip6tables$nft" ]; then + echo "SKIP: Test needs iptables, ip6tables or nft" + exit $ksft_skip +fi + +sfx=$(mktemp -u "XXXXXXXX") +ns1="ns1-$sfx" +ns2="ns2-$sfx" +trap "ip netns del $ns1; ip netns del $ns2" EXIT + +# create two netns, disable rp_filter in ns2 and +# keep IPv6 address when moving into VRF +ip netns add "$ns1" +ip netns add "$ns2" +ip netns exec "$ns2" sysctl -q net.ipv4.conf.all.rp_filter=0 +ip netns exec "$ns2" sysctl -q net.ipv4.conf.default.rp_filter=0 +ip netns exec "$ns2" sysctl -q net.ipv6.conf.all.keep_addr_on_down=1 + +# a standard connection between the netns, should not trigger rp filter +ip -net "$ns1" link add v0 type veth peer name v0 netns "$ns2" +ip -net "$ns1" link set v0 up; ip -net "$ns2" link set v0 up +ip -net "$ns1" a a 192.168.23.2/24 dev v0 +ip -net "$ns2" a a 192.168.23.1/24 dev v0 +ip -net "$ns1" a a fec0:23::2/64 dev v0 nodad +ip -net "$ns2" a a fec0:23::1/64 dev v0 nodad + +# rp filter testing: ns1 sends packets via v0 which ns2 would route back via d0 +ip -net "$ns2" link add d0 type dummy +ip -net "$ns2" link set d0 up +ip -net "$ns1" a a 192.168.42.2/24 dev v0 +ip -net "$ns2" a a 192.168.42.1/24 dev d0 +ip -net "$ns1" a a fec0:42::2/64 dev v0 nodad +ip -net "$ns2" a a fec0:42::1/64 dev d0 nodad + +# firewall matches to test +ip netns exec "$ns2" "$iptables" -t raw -A PREROUTING -s 192.168.0.0/16 -m rpfilter +ip netns exec "$ns2" "$ip6tables" -t raw -A PREROUTING -s fec0::/16 -m rpfilter +ip netns exec "$ns2" nft -f - <<EOF +table inet t { + chain c { + type filter hook prerouting priority raw; + ip saddr 192.168.0.0/16 fib saddr . iif oif exists counter + ip6 saddr fec0::/16 fib saddr . iif oif exists counter + } +} +EOF + +die() { + echo "FAIL: $*" + #ip netns exec "$ns2" "$iptables" -t raw -vS + #ip netns exec "$ns2" "$ip6tables" -t raw -vS + #ip netns exec "$ns2" nft list ruleset + exit 1 +} + +# check rule counters, return true if rule did not match +ipt_zero_rule() { # (command) + [ -n "$1" ] || return 0 + ip netns exec "$ns2" "$1" -t raw -vS | grep -q -- "-m rpfilter -c 0 0" +} +nft_zero_rule() { # (family) + [ -n "$nft" ] || return 0 + ip netns exec "$ns2" "$nft" list chain inet t c | \ grep -q "$1 saddr .* counter packets 0 bytes 0" +} + +netns_ping() { # (netns, args...) + local netns="$1" + shift + ip netns exec "$netns" ping -q -c 1 -W 1 "$@" >/dev/null +} + +testrun() { + # clear counters first + [ -n "$iptables" ] && ip netns exec "$ns2" "$iptables" -t raw -Z + [ -n "$ip6tables" ] && ip netns exec "$ns2" "$ip6tables" -t raw -Z + if [ -n "$nft" ]; then + ( + echo "delete table inet t"; + ip netns exec "$ns2" nft -s list table inet t; + ) | ip netns exec "$ns2" nft -f - + fi + + # test 1: martian traffic should fail rpfilter matches + netns_ping "$ns1" -I v0 192.168.42.1 && \ die "martian ping 192.168.42.1 succeeded" + netns_ping "$ns1" -I v0 fec0:42::1 && \ die "martian ping fec0:42::1 succeeded" + + ipt_zero_rule "$iptables" || die "iptables matched martian" + ipt_zero_rule "$ip6tables" || die "ip6tables matched martian" + nft_zero_rule ip || die "nft IPv4 matched martian" + nft_zero_rule ip6 || die "nft IPv6 matched martian" + + # test 2: rpfilter match should pass for regular traffic + netns_ping "$ns1" 192.168.23.1 || \ die "regular ping 192.168.23.1 failed" + netns_ping "$ns1" fec0:23::1 || \ die "regular ping fec0:23::1 failed" + + ipt_zero_rule "$iptables" && die "iptables match not effective" + ipt_zero_rule "$ip6tables" && die "ip6tables match not effective" + nft_zero_rule ip && die "nft IPv4 match not effective" + nft_zero_rule ip6 && die "nft IPv6 match not effective" + +} + +testrun + +# repeat test with vrf device in $ns2 +ip -net "$ns2" link add vrf0 type vrf table 10 +ip -net "$ns2" link set vrf0 up +ip -net "$ns2" link set v0 master vrf0 + +testrun + +echo "PASS: netfilter reverse path match works as intended" +exit 0 diff --git a/tools/testing/selftests/powerpc/benchmarks/gettimeofday.c b/tools/testing/selftests/powerpc/benchmarks/gettimeofday.c index 6b415683357b..580fcac0a09f 100644 --- a/tools/testing/selftests/powerpc/benchmarks/gettimeofday.c +++ b/tools/testing/selftests/powerpc/benchmarks/gettimeofday.c @@ -12,7 +12,7 @@ static int test_gettimeofday(void) { int i; - struct timeval tv_start, tv_end; + struct timeval tv_start, tv_end, tv_diff; gettimeofday(&tv_start, NULL); @@ -20,7 +20,9 @@ static int test_gettimeofday(void) gettimeofday(&tv_end, NULL); } - printf("time = %.6f\n", tv_end.tv_sec - tv_start.tv_sec + (tv_end.tv_usec - tv_start.tv_usec) * 1e-6); + timersub(&tv_end, &tv_start, &tv_diff); + + printf("time = %.6f\n", tv_diff.tv_sec + (tv_diff.tv_usec) * 1e-6); return 0; }
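timersub(a, b, res) from <sys/time.h> computes res = a - b, so the end timestamp must be passed first to get a positive elapsed time; a minimal self-contained usage sketch:

        #include <stdio.h>
        #include <sys/time.h>
        #include <unistd.h>

        int main(void)
        {
                struct timeval start, end, diff;

                gettimeofday(&start, NULL);
                usleep(250000);                 /* stand-in for real work */
                gettimeofday(&end, NULL);

                /* diff = end - start; swapping the first two arguments
                 * would yield a negative interval. */
                timersub(&end, &start, &diff);
                printf("elapsed = %.6f s\n",
                       diff.tv_sec + diff.tv_usec * 1e-6);
                return 0;
        }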
diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile index 27dc09d0bfee..19dd0b2ea397 100644 --- a/tools/testing/selftests/powerpc/mm/Makefile +++ b/tools/testing/selftests/powerpc/mm/Makefile @@ -3,7 +3,7 @@ noarg: $(MAKE) -C ../ TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors wild_bctr \ - large_vm_fork_separation bad_accesses pkey_exec_prot \ + large_vm_fork_separation bad_accesses exec_prot pkey_exec_prot \ pkey_siginfo stack_expansion_signal stack_expansion_ldst \ large_vm_gpr_corruption TEST_PROGS := stress_code_patching.sh @@ -22,6 +22,7 @@ $(OUTPUT)/wild_bctr: CFLAGS += -m64 $(OUTPUT)/large_vm_fork_separation: CFLAGS += -m64 $(OUTPUT)/large_vm_gpr_corruption: CFLAGS += -m64 $(OUTPUT)/bad_accesses: CFLAGS += -m64 +$(OUTPUT)/exec_prot: CFLAGS += -m64 $(OUTPUT)/pkey_exec_prot: CFLAGS += -m64 $(OUTPUT)/pkey_siginfo: CFLAGS += -m64 diff --git a/tools/testing/selftests/powerpc/mm/exec_prot.c b/tools/testing/selftests/powerpc/mm/exec_prot.c new file mode 100644 index 000000000000..db75b2225de1 --- /dev/null +++ b/tools/testing/selftests/powerpc/mm/exec_prot.c @@ -0,0 
+1,231 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2022, Nicholas Miehlbradt, IBM Corporation + * based on pkey_exec_prot.c + * + * Test if applying execute protection on pages works as expected. + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <signal.h> + +#include <unistd.h> +#include <sys/mman.h> + +#include "pkeys.h" + + +#define PPC_INST_NOP 0x60000000 +#define PPC_INST_TRAP 0x7fe00008 +#define PPC_INST_BLR 0x4e800020 + +static volatile sig_atomic_t fault_code; +static volatile sig_atomic_t remaining_faults; +static volatile unsigned int *fault_addr; +static unsigned long pgsize, numinsns; +static unsigned int *insns; +static bool pkeys_supported; + +static bool is_fault_expected(int fault_code) +{ + if (fault_code == SEGV_ACCERR) + return true; + + /* Assume any pkey error is fine since pkey_exec_prot test covers them */ + if (fault_code == SEGV_PKUERR && pkeys_supported) + return true; + + return false; +} + +static void trap_handler(int signum, siginfo_t *sinfo, void *ctx) +{ + /* Check if this fault originated from the expected address */ + if (sinfo->si_addr != (void *)fault_addr) + sigsafe_err("got a fault for an unexpected address\n"); + + _exit(1); +} + +static void segv_handler(int signum, siginfo_t *sinfo, void *ctx) +{ + fault_code = sinfo->si_code; + + /* Check if this fault originated from the expected address */ + if (sinfo->si_addr != (void *)fault_addr) { + sigsafe_err("got a fault for an unexpected address\n"); + _exit(1); + } + + /* Check if too many faults have occurred for a single test case */ + if (!remaining_faults) { + sigsafe_err("got too many faults for the same address\n"); + _exit(1); + } + + + /* Restore permissions in order to continue */ + if (is_fault_expected(fault_code)) { + if (mprotect(insns, pgsize, PROT_READ | PROT_WRITE | PROT_EXEC)) { + sigsafe_err("failed to set access permissions\n"); + _exit(1); + } + } else { + sigsafe_err("got a fault with an unexpected code\n"); + _exit(1); + } + + remaining_faults--; +} + +static int check_exec_fault(int rights) +{ + /* + * Jump to the executable region. + * + * The first iteration also checks if the overwrite of the + * first instruction word from a trap to a no-op succeeded. 
+ */ + fault_code = -1; + remaining_faults = 0; + if (!(rights & PROT_EXEC)) + remaining_faults = 1; + + FAIL_IF(mprotect(insns, pgsize, rights) != 0); + asm volatile("mtctr %0; bctrl" : : "r"(insns)); + + FAIL_IF(remaining_faults != 0); + if (!(rights & PROT_EXEC)) + FAIL_IF(!is_fault_expected(fault_code)); + + return 0; +} + +static int test(void) +{ + struct sigaction segv_act, trap_act; + int i; + + /* Skip the test if the CPU doesn't support Radix */ + SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_00)); + + /* Check if pkeys are supported */ + pkeys_supported = pkeys_unsupported() == 0; + + /* Setup SIGSEGV handler */ + segv_act.sa_handler = 0; + segv_act.sa_sigaction = segv_handler; + FAIL_IF(sigprocmask(SIG_SETMASK, 0, &segv_act.sa_mask) != 0); + segv_act.sa_flags = SA_SIGINFO; + segv_act.sa_restorer = 0; + FAIL_IF(sigaction(SIGSEGV, &segv_act, NULL) != 0); + + /* Setup SIGTRAP handler */ + trap_act.sa_handler = 0; + trap_act.sa_sigaction = trap_handler; + FAIL_IF(sigprocmask(SIG_SETMASK, 0, &trap_act.sa_mask) != 0); + trap_act.sa_flags = SA_SIGINFO; + trap_act.sa_restorer = 0; + FAIL_IF(sigaction(SIGTRAP, &trap_act, NULL) != 0); + + /* Setup executable region */ + pgsize = getpagesize(); + numinsns = pgsize / sizeof(unsigned int); + insns = (unsigned int *)mmap(NULL, pgsize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + FAIL_IF(insns == MAP_FAILED); + + /* Write the instruction words */ + for (i = 1; i < numinsns - 1; i++) + insns[i] = PPC_INST_NOP; + + /* + * Set the first instruction as an unconditional trap. If + * the last write to this address succeeds, this should + * get overwritten by a no-op. + */ + insns[0] = PPC_INST_TRAP; + + /* + * Later, to jump to the executable region, we use a branch + * and link instruction (bctrl) which sets the return address + * automatically in LR. Use that to return back. + */ + insns[numinsns - 1] = PPC_INST_BLR; + + /* + * Pick the first instruction's address from the executable + * region. + */ + fault_addr = insns; + + /* + * Read an instruction word from the address when the page + * is execute only. This should generate an access fault. + */ + fault_code = -1; + remaining_faults = 1; + printf("Testing read on --x, should fault..."); + FAIL_IF(mprotect(insns, pgsize, PROT_EXEC) != 0); + i = *fault_addr; + FAIL_IF(remaining_faults != 0 || !is_fault_expected(fault_code)); + printf("ok!\n"); + + /* + * Write an instruction word to the address when the page + * is execute only. This should also generate an access fault. + */ + fault_code = -1; + remaining_faults = 1; + printf("Testing write on --x, should fault..."); + FAIL_IF(mprotect(insns, pgsize, PROT_EXEC) != 0); + *fault_addr = PPC_INST_NOP; + FAIL_IF(remaining_faults != 0 || !is_fault_expected(fault_code)); + printf("ok!\n"); + + printf("Testing exec on ---, should fault..."); + FAIL_IF(check_exec_fault(PROT_NONE)); + printf("ok!\n"); + + printf("Testing exec on r--, should fault..."); + FAIL_IF(check_exec_fault(PROT_READ)); + printf("ok!\n"); + + printf("Testing exec on -w-, should fault..."); + FAIL_IF(check_exec_fault(PROT_WRITE)); + printf("ok!\n"); + + printf("Testing exec on rw-, should fault..."); + FAIL_IF(check_exec_fault(PROT_READ | PROT_WRITE)); + printf("ok!\n"); + + printf("Testing exec on --x, should succeed..."); + FAIL_IF(check_exec_fault(PROT_EXEC)); + printf("ok!\n"); + + printf("Testing exec on r-x, should succeed..."); + FAIL_IF(check_exec_fault(PROT_READ | PROT_EXEC)); + printf("ok!\n"); + + printf("Testing exec on -wx, should succeed..."); + FAIL_IF(check_exec_fault(PROT_WRITE | PROT_EXEC)); + printf("ok!\n"); + + printf("Testing exec on rwx, should succeed..."); + FAIL_IF(check_exec_fault(PROT_READ | PROT_WRITE | PROT_EXEC)); + printf("ok!\n"); + + /* Cleanup */ + FAIL_IF(munmap((void *)insns, pgsize)); + + return 0; +} + +int main(void) +{ + return test_harness(test, "exec_prot"); +}
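The probe above is powerpc-specific (trap/nop/blr words plus the harness's FAIL_IF); as a condensed illustration of the same mprotect()/SIGSEGV dance, here is a hypothetical x86-64 variant (assumes an NX-capable CPU, so PROT_READ alone is not executable):

        #include <setjmp.h>
        #include <signal.h>
        #include <stdint.h>
        #include <stdio.h>
        #include <sys/mman.h>
        #include <unistd.h>

        static sigjmp_buf env;

        static void on_segv(int sig)
        {
                siglongjmp(env, 1);     /* sigsetjmp(.., 1) restores the mask */
        }

        /* Returns 1 if jumping into 'page' faults, 0 if it executes. */
        static int exec_faults(void *page)
        {
                void (*fn)(void) = (void (*)(void))(uintptr_t)page;

                if (sigsetjmp(env, 1))
                        return 1;       /* landed here from the handler */
                fn();
                return 0;
        }

        int main(void)
        {
                long pgsz = sysconf(_SC_PAGESIZE);
                unsigned char *page = mmap(NULL, pgsz, PROT_READ | PROT_WRITE,
                                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
                struct sigaction act = { .sa_handler = on_segv };

                sigaction(SIGSEGV, &act, NULL);
                page[0] = 0xc3;         /* x86-64 'ret' */

                mprotect(page, pgsz, PROT_READ);        /* r--: exec must fault */
                printf("r--: %s\n", exec_faults(page) ? "faults" : "executes");

                mprotect(page, pgsz, PROT_EXEC);        /* --x: exec must work */
                printf("--x: %s\n", exec_faults(page) ? "faults" : "executes");

                munmap(page, pgsz);
                return 0;
        }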
diff --git a/tools/testing/selftests/powerpc/mm/large_vm_gpr_corruption.c b/tools/testing/selftests/powerpc/mm/large_vm_gpr_corruption.c index 927bfae99ed9..7da515f1da72 100644 --- a/tools/testing/selftests/powerpc/mm/large_vm_gpr_corruption.c +++ b/tools/testing/selftests/powerpc/mm/large_vm_gpr_corruption.c @@ -112,6 +112,8 @@ static int test(void) // This tests a hash MMU specific bug. FAIL_IF(using_hash_mmu(&hash_mmu)); SKIP_IF(!hash_mmu); + // 4K kernels don't support 4PB address space + SKIP_IF(sysconf(_SC_PAGESIZE) < 65536); page_size = sysconf(_SC_PAGESIZE); diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/bhrb_filter_map_test.c b/tools/testing/selftests/powerpc/pmu/sampling_tests/bhrb_filter_map_test.c index 8182647c63c8..3f43c315c666 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/bhrb_filter_map_test.c +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/bhrb_filter_map_test.c @@ -96,6 +96,15 @@ static int bhrb_filter_map_test(void) } } + /* + * Combine filter maps which include a valid branch filter and an invalid branch + * filter. Example: any (PERF_SAMPLE_BRANCH_ANY) and any_call + * (PERF_SAMPLE_BRANCH_ANY_CALL). + * The perf_event_open() call should fail in this case. 
+ */ + event.attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY | PERF_SAMPLE_BRANCH_ANY_CALL; + FAIL_IF(!event_open(&event)); + return 0; } diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index c4e6a34f9657..a156ac5dd2c6 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -5,6 +5,7 @@ /proc-fsconfig-hidepid /proc-loadavg-001 /proc-multiple-procfs +/proc-empty-vm /proc-pid-vm /proc-self-map-files-001 /proc-self-map-files-002 diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index 219fc6113847..cd95369254c0 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -8,6 +8,7 @@ TEST_GEN_PROGS += fd-001-lookup TEST_GEN_PROGS += fd-002-posix-eq TEST_GEN_PROGS += fd-003-kthread TEST_GEN_PROGS += proc-loadavg-001 +TEST_GEN_PROGS += proc-empty-vm TEST_GEN_PROGS += proc-pid-vm TEST_GEN_PROGS += proc-self-map-files-001 TEST_GEN_PROGS += proc-self-map-files-002 diff --git a/tools/testing/selftests/proc/proc-empty-vm.c b/tools/testing/selftests/proc/proc-empty-vm.c new file mode 100644 index 000000000000..d95b1cb43d9d --- /dev/null +++ b/tools/testing/selftests/proc/proc-empty-vm.c @@ -0,0 +1,386 @@ +/* + * Copyright (c) 2022 Alexey Dobriyan <adobriyan@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* + * Create a process without mappings by unmapping everything at once and + * holding it with ptrace(2). 
See what happens to + * + * /proc/${pid}/maps + * /proc/${pid}/numa_maps + * /proc/${pid}/smaps + * /proc/${pid}/smaps_rollup + */ +#undef NDEBUG +#include <assert.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/ptrace.h> +#include <sys/resource.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> + +/* + * 0: vsyscall VMA doesn't exist vsyscall=none + * 1: vsyscall VMA is --xp vsyscall=xonly + * 2: vsyscall VMA is r-xp vsyscall=emulate + */ +static int g_vsyscall; +static const char *g_proc_pid_maps_vsyscall; +static const char *g_proc_pid_smaps_vsyscall; + +static const char proc_pid_maps_vsyscall_0[] = ""; +static const char proc_pid_maps_vsyscall_1[] = +"ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall]\n"; +static const char proc_pid_maps_vsyscall_2[] = +"ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"; + +static const char proc_pid_smaps_vsyscall_0[] = ""; + +static const char proc_pid_smaps_vsyscall_1[] = +"ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall]\n" +"Size: 4 kB\n" +"KernelPageSize: 4 kB\n" +"MMUPageSize: 4 kB\n" +"Rss: 0 kB\n" +"Pss: 0 kB\n" +"Pss_Dirty: 0 kB\n" +"Shared_Clean: 0 kB\n" +"Shared_Dirty: 0 kB\n" +"Private_Clean: 0 kB\n" +"Private_Dirty: 0 kB\n" +"Referenced: 0 kB\n" +"Anonymous: 0 kB\n" +"LazyFree: 0 kB\n" +"AnonHugePages: 0 kB\n" +"ShmemPmdMapped: 0 kB\n" +"FilePmdMapped: 0 kB\n" +"Shared_Hugetlb: 0 kB\n" +"Private_Hugetlb: 0 kB\n" +"Swap: 0 kB\n" +"SwapPss: 0 kB\n" +"Locked: 0 kB\n" +"THPeligible: 0\n" +/* + * "ProtectionKey:" field is conditional. It is possible to check it as well, + * but I don't have such a machine. + */ +; + +static const char proc_pid_smaps_vsyscall_2[] = +"ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n" +"Size: 4 kB\n" +"KernelPageSize: 4 kB\n" +"MMUPageSize: 4 kB\n" +"Rss: 0 kB\n" +"Pss: 0 kB\n" +"Pss_Dirty: 0 kB\n" +"Shared_Clean: 0 kB\n" +"Shared_Dirty: 0 kB\n" +"Private_Clean: 0 kB\n" +"Private_Dirty: 0 kB\n" +"Referenced: 0 kB\n" +"Anonymous: 0 kB\n" +"LazyFree: 0 kB\n" +"AnonHugePages: 0 kB\n" +"ShmemPmdMapped: 0 kB\n" +"FilePmdMapped: 0 kB\n" +"Shared_Hugetlb: 0 kB\n" +"Private_Hugetlb: 0 kB\n" +"Swap: 0 kB\n" +"SwapPss: 0 kB\n" +"Locked: 0 kB\n" +"THPeligible: 0\n" +/* + * "ProtectionKey:" field is conditional. It is possible to check it as well, + * but I'm too tired. + */ +; + +static void sigaction_SIGSEGV(int _, siginfo_t *__, void *___) +{ + _exit(EXIT_FAILURE); +} + +static void sigaction_SIGSEGV_vsyscall(int _, siginfo_t *__, void *___) +{ + _exit(g_vsyscall); +} + +/* + * vsyscall page can't be unmapped, probe it directly. + */ +static void vsyscall(void) +{ + pid_t pid; + int wstatus; + + pid = fork(); + if (pid < 0) { + fprintf(stderr, "fork, errno %d\n", errno); + exit(1); + } + if (pid == 0) { + setrlimit(RLIMIT_CORE, &(struct rlimit){}); + + /* Hide "segfault at ffffffffff600000" messages. 
*/ + struct sigaction act = {}; + act.sa_flags = SA_SIGINFO; + act.sa_sigaction = sigaction_SIGSEGV_vsyscall; + sigaction(SIGSEGV, &act, NULL); + + g_vsyscall = 0; + /* gettimeofday(NULL, NULL); */ + asm volatile ( + "call %P0" + : + : "i" (0xffffffffff600000), "D" (NULL), "S" (NULL) + : "rax", "rcx", "r11" + ); + + g_vsyscall = 1; + *(volatile int *)0xffffffffff600000UL; + + g_vsyscall = 2; + exit(g_vsyscall); + } + waitpid(pid, &wstatus, 0); + if (WIFEXITED(wstatus)) { + g_vsyscall = WEXITSTATUS(wstatus); + } else { + fprintf(stderr, "error: vsyscall wstatus %08x\n", wstatus); + exit(1); + } +} + +static int test_proc_pid_maps(pid_t pid) +{ + char buf[4096]; + snprintf(buf, sizeof(buf), "/proc/%u/maps", pid); + int fd = open(buf, O_RDONLY); + if (fd == -1) { + perror("open /proc/${pid}/maps"); + return EXIT_FAILURE; + } else { + ssize_t rv = read(fd, buf, sizeof(buf)); + close(fd); + if (g_vsyscall == 0) { + assert(rv == 0); + } else { + size_t len = strlen(g_proc_pid_maps_vsyscall); + assert(rv == len); + assert(memcmp(buf, g_proc_pid_maps_vsyscall, len) == 0); + } + return EXIT_SUCCESS; + } +} + +static int test_proc_pid_numa_maps(pid_t pid) +{ + char buf[4096]; + snprintf(buf, sizeof(buf), "/proc/%u/numa_maps", pid); + int fd = open(buf, O_RDONLY); + if (fd == -1) { + if (errno == ENOENT) { + /* + * /proc/${pid}/numa_maps is under CONFIG_NUMA, + * it doesn't necessarily exist. + */ + return EXIT_SUCCESS; + } + perror("open /proc/${pid}/numa_maps"); + return EXIT_FAILURE; + } else { + ssize_t rv = read(fd, buf, sizeof(buf)); + close(fd); + assert(rv == 0); + return EXIT_SUCCESS; + } +} + +static int test_proc_pid_smaps(pid_t pid) +{ + char buf[4096]; + snprintf(buf, sizeof(buf), "/proc/%u/smaps", pid); + int fd = open(buf, O_RDONLY); + if (fd == -1) { + if (errno == ENOENT) { + /* + * /proc/${pid}/smaps is under CONFIG_PROC_PAGE_MONITOR, + * it doesn't necessarily exist. + */ + return EXIT_SUCCESS; + } + perror("open /proc/${pid}/smaps"); + return EXIT_FAILURE; + } else { + ssize_t rv = read(fd, buf, sizeof(buf)); + close(fd); + if (g_vsyscall == 0) { + assert(rv == 0); + } else { + size_t len = strlen(g_proc_pid_maps_vsyscall); + /* TODO "ProtectionKey:" */ + assert(rv > len); + assert(memcmp(buf, g_proc_pid_maps_vsyscall, len) == 0); + } + return EXIT_SUCCESS; + } +} + +static const char g_smaps_rollup[] = +"00000000-00000000 ---p 00000000 00:00 0 [rollup]\n" +"Rss: 0 kB\n" +"Pss: 0 kB\n" +"Pss_Dirty: 0 kB\n" +"Pss_Anon: 0 kB\n" +"Pss_File: 0 kB\n" +"Pss_Shmem: 0 kB\n" +"Shared_Clean: 0 kB\n" +"Shared_Dirty: 0 kB\n" +"Private_Clean: 0 kB\n" +"Private_Dirty: 0 kB\n" +"Referenced: 0 kB\n" +"Anonymous: 0 kB\n" +"LazyFree: 0 kB\n" +"AnonHugePages: 0 kB\n" +"ShmemPmdMapped: 0 kB\n" +"FilePmdMapped: 0 kB\n" +"Shared_Hugetlb: 0 kB\n" +"Private_Hugetlb: 0 kB\n" +"Swap: 0 kB\n" +"SwapPss: 0 kB\n" +"Locked: 0 kB\n" +; + +static int test_proc_pid_smaps_rollup(pid_t pid) +{ + char buf[4096]; + snprintf(buf, sizeof(buf), "/proc/%u/smaps_rollup", pid); + int fd = open(buf, O_RDONLY); + if (fd == -1) { + if (errno == ENOENT) { + /* + * /proc/${pid}/smaps_rollup is under CONFIG_PROC_PAGE_MONITOR, + * it doesn't necessarily exist. 
+ */ + return EXIT_SUCCESS; + } + perror("open /proc/${pid}/smaps_rollup"); + return EXIT_FAILURE; + } else { + ssize_t rv = read(fd, buf, sizeof(buf)); + close(fd); + assert(rv == sizeof(g_smaps_rollup) - 1); + assert(memcmp(buf, g_smaps_rollup, sizeof(g_smaps_rollup) - 1) == 0); + return EXIT_SUCCESS; + } +} + +int main(void) +{ + int rv = EXIT_SUCCESS; + + vsyscall(); + + switch (g_vsyscall) { + case 0: + g_proc_pid_maps_vsyscall = proc_pid_maps_vsyscall_0; + g_proc_pid_smaps_vsyscall = proc_pid_smaps_vsyscall_0; + break; + case 1: + g_proc_pid_maps_vsyscall = proc_pid_maps_vsyscall_1; + g_proc_pid_smaps_vsyscall = proc_pid_smaps_vsyscall_1; + break; + case 2: + g_proc_pid_maps_vsyscall = proc_pid_maps_vsyscall_2; + g_proc_pid_smaps_vsyscall = proc_pid_smaps_vsyscall_2; + break; + default: + abort(); + } + + pid_t pid = fork(); + if (pid == -1) { + perror("fork"); + return EXIT_FAILURE; + } else if (pid == 0) { + rv = ptrace(PTRACE_TRACEME, 0, NULL, NULL); + if (rv != 0) { + if (errno == EPERM) { + fprintf(stderr, +"Did you know? ptrace(PTRACE_TRACEME) doesn't work under strace.\n" + ); + kill(getppid(), SIGTERM); + return EXIT_FAILURE; + } + perror("ptrace PTRACE_TRACEME"); + return EXIT_FAILURE; + } + + /* + * Hide "segfault at ..." messages. Signal handler won't run. + */ + struct sigaction act = {}; + act.sa_flags = SA_SIGINFO; + act.sa_sigaction = sigaction_SIGSEGV; + sigaction(SIGSEGV, &act, NULL); + +#ifdef __amd64__ + munmap(NULL, ((size_t)1 << 47) - 4096); +#else +#error "implement 'unmap everything'" +#endif + return EXIT_FAILURE; + } else { + /* + * TODO find reliable way to signal parent that munmap(2) completed. + * Child can't do it directly because it effectively doesn't exist + * anymore. Looking at child's VM files isn't 100% reliable either: + * due to a bug they may not become empty or empty-like. + */ + sleep(1); + + if (rv == EXIT_SUCCESS) { + rv = test_proc_pid_maps(pid); + } + if (rv == EXIT_SUCCESS) { + rv = test_proc_pid_numa_maps(pid); + } + if (rv == EXIT_SUCCESS) { + rv = test_proc_pid_smaps(pid); + } + if (rv == EXIT_SUCCESS) { + rv = test_proc_pid_smaps_rollup(pid); + } + /* + * TODO test /proc/${pid}/statm, task_statm() + * ->start_code, ->end_code aren't updated by munmap(). + * Output can be "0 0 0 2 0 0 0\n" where "2" can be anything. + */ + + /* Cut the rope. 
*/ + int wstatus; + waitpid(pid, &wstatus, 0); + assert(WIFSTOPPED(wstatus)); + assert(WSTOPSIG(wstatus) == SIGSEGV); + } + + return rv; +} diff --git a/tools/testing/selftests/proc/proc-pid-vm.c b/tools/testing/selftests/proc/proc-pid-vm.c index e5962f4794f5..69551bfa215c 100644 --- a/tools/testing/selftests/proc/proc-pid-vm.c +++ b/tools/testing/selftests/proc/proc-pid-vm.c @@ -213,22 +213,22 @@ static int make_exe(const uint8_t *payload, size_t len) /* * 0: vsyscall VMA doesn't exist vsyscall=none - * 1: vsyscall VMA is r-xp vsyscall=emulate - * 2: vsyscall VMA is --xp vsyscall=xonly + * 1: vsyscall VMA is --xp vsyscall=xonly + * 2: vsyscall VMA is r-xp vsyscall=emulate */ -static int g_vsyscall; +static volatile int g_vsyscall; static const char *str_vsyscall; static const char str_vsyscall_0[] = ""; static const char str_vsyscall_1[] = -"ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"; -static const char str_vsyscall_2[] = "ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall]\n"; +static const char str_vsyscall_2[] = +"ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"; #ifdef __x86_64__ static void sigaction_SIGSEGV(int _, siginfo_t *__, void *___) { - _exit(1); + _exit(g_vsyscall); } /* @@ -255,6 +255,7 @@ static void vsyscall(void) act.sa_sigaction = sigaction_SIGSEGV; (void)sigaction(SIGSEGV, &act, NULL); + g_vsyscall = 0; /* gettimeofday(NULL, NULL); */ asm volatile ( "call %P0" @@ -262,45 +263,20 @@ static void vsyscall(void) : "i" (0xffffffffff600000), "D" (NULL), "S" (NULL) : "rax", "rcx", "r11" ); - exit(0); - } - waitpid(pid, &wstatus, 0); - if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) { - /* vsyscall page exists and is executable. */ - } else { - /* vsyscall page doesn't exist. */ - g_vsyscall = 0; - return; - } - - pid = fork(); - if (pid < 0) { - fprintf(stderr, "fork, errno %d\n", errno); - exit(1); - } - if (pid == 0) { - struct rlimit rlim = {0, 0}; - (void)setrlimit(RLIMIT_CORE, &rlim); - - /* Hide "segfault at ffffffffff600000" messages. */ - struct sigaction act; - memset(&act, 0, sizeof(struct sigaction)); - act.sa_flags = SA_SIGINFO; - act.sa_sigaction = sigaction_SIGSEGV; - (void)sigaction(SIGSEGV, &act, NULL); + g_vsyscall = 1; *(volatile int *)0xffffffffff600000UL; - exit(0); + + g_vsyscall = 2; + exit(g_vsyscall); } waitpid(pid, &wstatus, 0); - if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) { - /* vsyscall page is readable and executable. */ - g_vsyscall = 1; - return; + if (WIFEXITED(wstatus)) { + g_vsyscall = WEXITSTATUS(wstatus); + } else { + fprintf(stderr, "error: wstatus %08x\n", wstatus); + exit(1); } - - /* vsyscall page is executable but unreadable. 
*/ - g_vsyscall = 2; } int main(void) diff --git a/tools/testing/selftests/tpm2/tpm2.py b/tools/testing/selftests/tpm2/tpm2.py index 057a4f49c79d..c7363c6764fc 100644 --- a/tools/testing/selftests/tpm2/tpm2.py +++ b/tools/testing/selftests/tpm2/tpm2.py @@ -371,6 +371,10 @@ class Client: fcntl.fcntl(self.tpm, fcntl.F_SETFL, flags) self.tpm_poll = select.poll() + def __del__(self): + if self.tpm: + self.tpm.close() + def close(self): self.tpm.close() diff --git a/tools/testing/selftests/user_events/ftrace_test.c b/tools/testing/selftests/user_events/ftrace_test.c index a80fb5ef61d5..404a2713dcae 100644 --- a/tools/testing/selftests/user_events/ftrace_test.c +++ b/tools/testing/selftests/user_events/ftrace_test.c @@ -22,6 +22,11 @@ const char *enable_file = "/sys/kernel/debug/tracing/events/user_events/__test_e const char *trace_file = "/sys/kernel/debug/tracing/trace"; const char *fmt_file = "/sys/kernel/debug/tracing/events/user_events/__test_event/format"; +static inline int status_check(char *status_page, int status_bit) +{ + return status_page[status_bit >> 3] & (1 << (status_bit & 7)); +} + static int trace_bytes(void) { int fd = open(trace_file, O_RDONLY); @@ -197,12 +202,12 @@ TEST_F(user, register_events) { /* Register should work */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); + ASSERT_NE(0, reg.status_bit); /* Multiple registers should result in same index */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); + ASSERT_NE(0, reg.status_bit); /* Ensure disabled */ self->enable_fd = open(enable_file, O_RDWR); @@ -212,15 +217,15 @@ TEST_F(user, register_events) { /* MMAP should work and be zero'd */ ASSERT_NE(MAP_FAILED, status_page); ASSERT_NE(NULL, status_page); - ASSERT_EQ(0, status_page[reg.status_index]); + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); /* Enable event and ensure bits updated in status */ ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1"))) - ASSERT_EQ(EVENT_STATUS_FTRACE, status_page[reg.status_index]); + ASSERT_NE(0, status_check(status_page, reg.status_bit)); /* Disable event and ensure bits updated in status */ ASSERT_NE(-1, write(self->enable_fd, "0", sizeof("0"))) - ASSERT_EQ(0, status_page[reg.status_index]); + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); /* File still open should return -EBUSY for delete */ ASSERT_EQ(-1, ioctl(self->data_fd, DIAG_IOCSDEL, "__test_event")); @@ -240,6 +245,8 @@ TEST_F(user, write_events) { struct iovec io[3]; __u32 field1, field2; int before = 0, after = 0; + int page_size = sysconf(_SC_PAGESIZE); + char *status_page; reg.size = sizeof(reg); reg.name_args = (__u64)"__test_event u32 field1; u32 field2"; @@ -254,10 +261,18 @@ TEST_F(user, write_events) { io[2].iov_base = &field2; io[2].iov_len = sizeof(field2); + status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED, + self->status_fd, 0); + /* Register should work */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); + ASSERT_NE(0, reg.status_bit); + + /* MMAP should work and be zero'd */ + ASSERT_NE(MAP_FAILED, status_page); + ASSERT_NE(NULL, status_page); + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); /* Write should fail on invalid slot with ENOENT */ io[0].iov_base = &field2; @@ -271,6 +286,9 @@ TEST_F(user, write_events) { self->enable_fd = open(enable_file, O_RDWR); ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1"))) + /* 
Event should now be enabled */ + ASSERT_NE(0, status_check(status_page, reg.status_bit)); + /* Write should make it out to ftrace buffers */ before = trace_bytes(); ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 3)); @@ -298,7 +316,7 @@ TEST_F(user, write_fault) { /* Register should work */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); + ASSERT_NE(0, reg.status_bit); /* Write should work normally */ ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 2)); @@ -315,6 +333,11 @@ TEST_F(user, write_validator) { int loc, bytes; char data[8]; int before = 0, after = 0; + int page_size = sysconf(_SC_PAGESIZE); + char *status_page; + + status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED, + self->status_fd, 0); reg.size = sizeof(reg); reg.name_args = (__u64)"__test_event __rel_loc char[] data"; @@ -322,7 +345,12 @@ TEST_F(user, write_validator) { /* Register should work */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); + ASSERT_NE(0, reg.status_bit); + + /* MMAP should work and be zero'd */ + ASSERT_NE(MAP_FAILED, status_page); + ASSERT_NE(NULL, status_page); + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); io[0].iov_base = ®.write_index; io[0].iov_len = sizeof(reg.write_index); @@ -340,6 +368,9 @@ TEST_F(user, write_validator) { self->enable_fd = open(enable_file, O_RDWR); ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1"))) + /* Event should now be enabled */ + ASSERT_NE(0, status_check(status_page, reg.status_bit)); + /* Full in-bounds write should work */ before = trace_bytes(); loc = DYN_LOC(0, bytes); diff --git a/tools/testing/selftests/user_events/perf_test.c b/tools/testing/selftests/user_events/perf_test.c index 26851d51d6bb..8b4c7879d5a7 100644 --- a/tools/testing/selftests/user_events/perf_test.c +++ b/tools/testing/selftests/user_events/perf_test.c @@ -35,6 +35,11 @@ static long perf_event_open(struct perf_event_attr *pe, pid_t pid, return syscall(__NR_perf_event_open, pe, pid, cpu, group_fd, flags); } +static inline int status_check(char *status_page, int status_bit) +{ + return status_page[status_bit >> 3] & (1 << (status_bit & 7)); +} + static int get_id(void) { FILE *fp = fopen(id_file, "r"); @@ -120,8 +125,8 @@ TEST_F(user, perf_write) { /* Register should work */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); - ASSERT_EQ(0, status_page[reg.status_index]); + ASSERT_NE(0, reg.status_bit); + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); /* Id should be there */ id = get_id(); @@ -144,7 +149,7 @@ TEST_F(user, perf_write) { ASSERT_NE(MAP_FAILED, perf_page); /* Status should be updated */ - ASSERT_EQ(EVENT_STATUS_PERF, status_page[reg.status_index]); + ASSERT_NE(0, status_check(status_page, reg.status_bit)); event.index = reg.write_index; event.field1 = 0xc001; diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 31e5eea2a9b9..7b9dc2426f18 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -30,7 +30,6 @@ map_fixed_noreplace write_to_hugetlbfs hmm-tests memfd_secret -local_config.* soft-dirty split_huge_page_test ksm_tests diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index d516b8c38eed..163c2fde3cb3 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ 
-1,9 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for vm selftests -LOCAL_HDRS += $(selfdir)/vm/local_config.h $(top_srcdir)/mm/gup_test.h - -include local_config.mk +LOCAL_HDRS += $(top_srcdir)/mm/gup_test.h uname_M := $(shell uname -m 2>/dev/null || echo not) MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/') @@ -97,9 +95,11 @@ TEST_FILES += va_128TBswitch.sh include ../lib.mk +$(OUTPUT)/khugepaged: vm_util.c $(OUTPUT)/madv_populate: vm_util.c $(OUTPUT)/soft-dirty: vm_util.c $(OUTPUT)/split_huge_page_test: vm_util.c +$(OUTPUT)/userfaultfd: vm_util.c ifeq ($(MACHINE),x86_64) BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) @@ -152,23 +152,6 @@ endif $(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap -# HMM_EXTRA_LIBS may get set in local_config.mk, or it may be left empty. -$(OUTPUT)/hmm-tests: LDLIBS += $(HMM_EXTRA_LIBS) - $(OUTPUT)/ksm_tests: LDLIBS += -lnuma $(OUTPUT)/migration: LDLIBS += -lnuma - -local_config.mk local_config.h: check_config.sh - /bin/sh ./check_config.sh $(CC) - -EXTRA_CLEAN += local_config.mk local_config.h - -ifeq ($(HMM_EXTRA_LIBS),) -all: warn_missing_hugelibs - -warn_missing_hugelibs: - @echo ; \ - echo "Warning: missing libhugetlbfs support. Some HMM tests will be skipped." ; \ - echo -endif diff --git a/tools/testing/selftests/vm/check_config.sh b/tools/testing/selftests/vm/check_config.sh deleted file mode 100644 index 079c8a40b85d..000000000000 --- a/tools/testing/selftests/vm/check_config.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# -# Probe for libraries and create header files to record the results. Both C -# header files and Makefile include fragments are created. - -OUTPUT_H_FILE=local_config.h -OUTPUT_MKFILE=local_config.mk - -# libhugetlbfs -tmpname=$(mktemp) -tmpfile_c=${tmpname}.c -tmpfile_o=${tmpname}.o - -echo "#include <sys/types.h>" > $tmpfile_c -echo "#include <hugetlbfs.h>" >> $tmpfile_c -echo "int func(void) { return 0; }" >> $tmpfile_c - -CC=${1:?"Usage: $0 <compiler> # example compiler: gcc"} -$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1 - -if [ -f $tmpfile_o ]; then - echo "#define LOCAL_CONFIG_HAVE_LIBHUGETLBFS 1" > $OUTPUT_H_FILE - echo "HMM_EXTRA_LIBS = -lhugetlbfs" > $OUTPUT_MKFILE -else - echo "// No libhugetlbfs support found" > $OUTPUT_H_FILE - echo "# No libhugetlbfs support found, so:" > $OUTPUT_MKFILE - echo "HMM_EXTRA_LIBS = " >> $OUTPUT_MKFILE -fi - -rm ${tmpname}.* diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c index 98b949c279be..4adaad1b822f 100644 --- a/tools/testing/selftests/vm/hmm-tests.c +++ b/tools/testing/selftests/vm/hmm-tests.c @@ -26,10 +26,6 @@ #include <sys/mman.h> #include <sys/ioctl.h> -#include "./local_config.h" -#ifdef LOCAL_CONFIG_HAVE_LIBHUGETLBFS -#include <hugetlbfs.h> -#endif /* * This is a private UAPI to the kernel test module so it isn't exported @@ -733,7 +729,54 @@ TEST_F(hmm, anon_write_huge) hmm_buffer_free(buffer); } -#ifdef LOCAL_CONFIG_HAVE_LIBHUGETLBFS +/* + * Read numeric data from raw and tagged kernel status files. Used to read + * /proc and /sys data (without a tag) and from /proc/meminfo (with a tag). 
+ */ +static long file_read_ulong(char *file, const char *tag) +{ + int fd; + char buf[2048]; + int len; + char *p, *q; + long val; + + fd = open(file, O_RDONLY); + if (fd < 0) { + /* Error opening the file */ + return -1; + } + + len = read(fd, buf, sizeof(buf)); + close(fd); + if (len < 0) { + /* Error in reading the file */ + return -1; + } + if (len == sizeof(buf)) { + /* Error file is too large */ + return -1; + } + buf[len] = '\0'; + + /* Search for a tag if provided */ + if (tag) { + p = strstr(buf, tag); + if (!p) + return -1; /* looks like the line we want isn't there */ + p += strlen(tag); + } else + p = buf; + + val = strtol(p, &q, 0); + if (*q != ' ') { + /* Error parsing the file */ + return -1; + } + + return val; +} + /* * Write huge TLBFS page. */ @@ -742,29 +785,27 @@ TEST_F(hmm, anon_write_hugetlbfs) struct hmm_buffer *buffer; unsigned long npages; unsigned long size; + unsigned long default_hsize; unsigned long i; int *ptr; int ret; - long pagesizes[4]; - int n, idx; - /* Skip test if we can't allocate a hugetlbfs page. */ - - n = gethugepagesizes(pagesizes, 4); - if (n <= 0) + default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:"); + if (default_hsize < 0 || default_hsize*1024 < default_hsize) SKIP(return, "Huge page size could not be determined"); - for (idx = 0; --n > 0; ) { - if (pagesizes[n] < pagesizes[idx]) - idx = n; - } - size = ALIGN(TWOMEG, pagesizes[idx]); + default_hsize = default_hsize*1024; /* KB to B */ + + size = ALIGN(TWOMEG, default_hsize); npages = size >> self->page_shift; buffer = malloc(sizeof(*buffer)); ASSERT_NE(buffer, NULL); - buffer->ptr = get_hugepage_region(size, GHR_STRICT); - if (buffer->ptr == NULL) { + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (buffer->ptr == MAP_FAILED) { free(buffer); SKIP(return, "Huge page could not be allocated"); } @@ -788,11 +829,10 @@ TEST_F(hmm, anon_write_hugetlbfs) for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) ASSERT_EQ(ptr[i], i); - free_hugepage_region(buffer->ptr); + munmap(buffer->ptr, buffer->size); buffer->ptr = NULL; hmm_buffer_free(buffer); } -#endif /* LOCAL_CONFIG_HAVE_LIBHUGETLBFS */ /* * Read mmap'ed file memory. @@ -1014,6 +1054,55 @@ TEST_F(hmm, migrate_fault) hmm_buffer_free(buffer); } +TEST_F(hmm, migrate_release) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Release device memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_RELEASE, buffer, npages); + ASSERT_EQ(ret, 0); + + /* Fault pages back to system memory and check them. 
*/ + for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + /* * Migrate anonymous shared memory to device private memory. */ @@ -1467,7 +1556,6 @@ TEST_F(hmm2, snapshot) hmm_buffer_free(buffer); } -#ifdef LOCAL_CONFIG_HAVE_LIBHUGETLBFS /* * Test the hmm_range_fault() HMM_PFN_PMD flag for large pages that * should be mapped by a large page table entry. @@ -1477,30 +1565,30 @@ TEST_F(hmm, compound) struct hmm_buffer *buffer; unsigned long npages; unsigned long size; + unsigned long default_hsize; int *ptr; unsigned char *m; int ret; - long pagesizes[4]; - int n, idx; unsigned long i; /* Skip test if we can't allocate a hugetlbfs page. */ - n = gethugepagesizes(pagesizes, 4); - if (n <= 0) - return; - for (idx = 0; --n > 0; ) { - if (pagesizes[n] < pagesizes[idx]) - idx = n; - } - size = ALIGN(TWOMEG, pagesizes[idx]); + default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:"); + if (default_hsize < 0 || default_hsize*1024 < default_hsize) + SKIP(return, "Huge page size could not be determined"); + default_hsize = default_hsize*1024; /* KB to B */ + + size = ALIGN(TWOMEG, default_hsize); npages = size >> self->page_shift; buffer = malloc(sizeof(*buffer)); ASSERT_NE(buffer, NULL); - buffer->ptr = get_hugepage_region(size, GHR_STRICT); - if (buffer->ptr == NULL) { + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (buffer->ptr == MAP_FAILED) { free(buffer); return; } @@ -1539,11 +1627,10 @@ TEST_F(hmm, compound) ASSERT_EQ(m[i], HMM_DMIRROR_PROT_READ | HMM_DMIRROR_PROT_PMD); - free_hugepage_region(buffer->ptr); + munmap(buffer->ptr, buffer->size); buffer->ptr = NULL; hmm_buffer_free(buffer); } -#endif /* LOCAL_CONFIG_HAVE_LIBHUGETLBFS */ /* * Test two devices reading the same memory (double mapped). 
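For reference: the hmm-tests.c hunks above drop the libhugetlbfs helpers (gethugepagesizes()/get_hugepage_region()) in favor of parsing the default huge page size out of /proc/meminfo and backing the buffer with mmap(MAP_HUGETLB). A minimal standalone sketch of that technique — illustrative only, not part of the patch, and assuming huge pages have already been reserved (e.g. echo 64 > /proc/sys/vm/nr_hugepages):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

/* Scan /proc/meminfo for the "Hugepagesize:" line; return bytes or -1. */
static long default_hugepage_bytes(void)
{
	FILE *fp = fopen("/proc/meminfo", "r");
	char buf[256];
	long kb = -1;

	if (!fp)
		return -1;
	while (fgets(buf, sizeof(buf), fp)) {
		/* Line looks like: "Hugepagesize:       2048 kB" */
		if (sscanf(buf, "Hugepagesize: %ld kB", &kb) == 1)
			break;
	}
	fclose(fp);
	return kb > 0 ? kb * 1024 : -1;	/* kB to bytes */
}

int main(void)
{
	long hsize = default_hugepage_bytes();
	char *p;

	if (hsize < 0) {
		fprintf(stderr, "could not determine huge page size\n");
		return 1;
	}
	/* Length must be a multiple of the huge page size for MAP_HUGETLB. */
	p = mmap(NULL, hsize, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (p == MAP_FAILED) {
		/* Typically means no huge pages are reserved. */
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}
	p[0] = 1;	/* touch the mapping to fault in a huge page */
	munmap(p, hsize);
	return 0;
}

This mirrors what the patch's file_read_ulong("/proc/meminfo", "Hugepagesize:") call plus the MAP_HUGETLB mmap() do, with no library dependency; where the mmap() fails, the selftests SKIP rather than fail, since missing reserved huge pages is an environment issue rather than a kernel bug.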
diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index 155120b67a16..64126c8cd561 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -1,6 +1,9 @@ #define _GNU_SOURCE +#include <ctype.h> +#include <errno.h> #include <fcntl.h> #include <limits.h> +#include <dirent.h> #include <signal.h> #include <stdio.h> #include <stdlib.h> @@ -10,10 +13,24 @@ #include <sys/mman.h> #include <sys/wait.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/sysmacros.h> +#include <sys/vfs.h> + +#include "linux/magic.h" + +#include "vm_util.h" #ifndef MADV_PAGEOUT #define MADV_PAGEOUT 21 #endif +#ifndef MADV_POPULATE_READ +#define MADV_POPULATE_READ 22 +#endif +#ifndef MADV_COLLAPSE +#define MADV_COLLAPSE 25 +#endif #define BASE_ADDR ((void *)(1UL << 30)) static unsigned long hpage_pmd_size; @@ -22,6 +39,47 @@ static int hpage_pmd_nr; #define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" #define PID_SMAPS "/proc/self/smaps" +#define TEST_FILE "collapse_test_file" + +#define MAX_LINE_LENGTH 500 + +enum vma_type { + VMA_ANON, + VMA_FILE, + VMA_SHMEM, +}; + +struct mem_ops { + void *(*setup_area)(int nr_hpages); + void (*cleanup_area)(void *p, unsigned long size); + void (*fault)(void *p, unsigned long start, unsigned long end); + bool (*check_huge)(void *addr, int nr_hpages); + const char *name; +}; + +static struct mem_ops *file_ops; +static struct mem_ops *anon_ops; +static struct mem_ops *shmem_ops; + +struct collapse_context { + void (*collapse)(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect); + bool enforce_pte_scan_limits; + const char *name; +}; + +static struct collapse_context *khugepaged_context; +static struct collapse_context *madvise_context; + +struct file_info { + const char *dir; + char path[PATH_MAX]; + enum vma_type type; + int fd; + char dev_queue_read_ahead_path[PATH_MAX]; +}; + +static struct file_info finfo; enum thp_enabled { THP_ALWAYS, @@ -88,18 +146,7 @@ struct settings { enum shmem_enabled shmem_enabled; bool use_zero_page; struct khugepaged_settings khugepaged; -}; - -static struct settings default_settings = { - .thp_enabled = THP_MADVISE, - .thp_defrag = THP_DEFRAG_ALWAYS, - .shmem_enabled = SHMEM_NEVER, - .use_zero_page = 0, - .khugepaged = { - .defrag = 1, - .alloc_sleep_millisecs = 10, - .scan_sleep_millisecs = 10, - }, + unsigned long read_ahead_kb; }; static struct settings saved_settings; @@ -118,6 +165,11 @@ static void fail(const char *msg) exit_status++; } +static void skip(const char *msg) +{ + printf(" \e[33m%s\e[0m\n", msg); +} + static int read_file(const char *path, char *buf, size_t buflen) { int fd; @@ -145,13 +197,19 @@ static int write_file(const char *path, const char *buf, size_t buflen) ssize_t numwritten; fd = open(path, O_WRONLY); - if (fd == -1) + if (fd == -1) { + printf("open(%s)\n", path); + exit(EXIT_FAILURE); return 0; + } numwritten = write(fd, buf, buflen - 1); close(fd); - if (numwritten < 1) + if (numwritten < 1) { + printf("write(%s)\n", buf); + exit(EXIT_FAILURE); return 0; + } return (unsigned int) numwritten; } @@ -218,20 +276,11 @@ static void write_string(const char *name, const char *val) } } -static const unsigned long read_num(const char *name) +static const unsigned long _read_num(const char *path) { - char path[PATH_MAX]; char buf[21]; - int ret; - - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - 
ret = read_file(path, buf, sizeof(buf)); - if (ret < 0) { + if (read_file(path, buf, sizeof(buf)) < 0) { perror("read_file(read_num)"); exit(EXIT_FAILURE); } @@ -239,10 +288,9 @@ static const unsigned long read_num(const char *name) return strtoul(buf, NULL, 10); } -static void write_num(const char *name, unsigned long num) +static const unsigned long read_num(const char *name) { char path[PATH_MAX]; - char buf[21]; int ret; ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); @@ -250,6 +298,12 @@ static void write_num(const char *name, unsigned long num) printf("%s: Pathname is too long\n", __func__); exit(EXIT_FAILURE); } + return _read_num(path); +} + +static void _write_num(const char *path, unsigned long num) +{ + char buf[21]; sprintf(buf, "%ld", num); if (!write_file(path, buf, strlen(buf) + 1)) { @@ -258,6 +312,19 @@ static void write_num(const char *name, unsigned long num) } } +static void write_num(const char *name, unsigned long num) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + _write_num(path, num); +} + static void write_settings(struct settings *settings) { struct khugepaged_settings *khugepaged = &settings->khugepaged; @@ -277,6 +344,43 @@ static void write_settings(struct settings *settings) write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap); write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared); write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); + + if (file_ops && finfo.type == VMA_FILE) + _write_num(finfo.dev_queue_read_ahead_path, + settings->read_ahead_kb); +} + +#define MAX_SETTINGS_DEPTH 4 +static struct settings settings_stack[MAX_SETTINGS_DEPTH]; +static int settings_index; + +static struct settings *current_settings(void) +{ + if (!settings_index) { + printf("Fail: No settings set"); + exit(EXIT_FAILURE); + } + return settings_stack + settings_index - 1; +} + +static void push_settings(struct settings *settings) +{ + if (settings_index >= MAX_SETTINGS_DEPTH) { + printf("Fail: Settings stack exceeded"); + exit(EXIT_FAILURE); + } + settings_stack[settings_index++] = *settings; + write_settings(current_settings()); +} + +static void pop_settings(void) +{ + if (settings_index <= 0) { + printf("Fail: Settings stack empty"); + exit(EXIT_FAILURE); + } + --settings_index; + write_settings(current_settings()); } static void restore_settings(int sig) @@ -314,6 +418,10 @@ static void save_settings(void) .max_ptes_shared = read_num("khugepaged/max_ptes_shared"), .pages_to_scan = read_num("khugepaged/pages_to_scan"), }; + if (file_ops && finfo.type == VMA_FILE) + saved_settings.read_ahead_kb = + _read_num(finfo.dev_queue_read_ahead_path); + success("OK"); signal(SIGTERM, restore_settings); @@ -322,72 +430,90 @@ static void save_settings(void) signal(SIGQUIT, restore_settings); } -static void adjust_settings(void) +static void get_finfo(const char *dir) { + struct stat path_stat; + struct statfs fs; + char buf[1 << 10]; + char path[PATH_MAX]; + char *str, *end; - printf("Adjust settings..."); - write_settings(&default_settings); - success("OK"); -} - -#define MAX_LINE_LENGTH 500 - -static bool check_for_pattern(FILE *fp, char *pattern, char *buf) -{ - while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) { - if (!strncmp(buf, pattern, strlen(pattern))) - return true; + finfo.dir = dir; + stat(finfo.dir, &path_stat); + if (!S_ISDIR(path_stat.st_mode)) { + printf("%s: Not a directory 
(%s)\n", __func__, finfo.dir); + exit(EXIT_FAILURE); } - return false; -} - -static bool check_huge(void *addr) -{ - bool thp = false; - int ret; - FILE *fp; - char buffer[MAX_LINE_LENGTH]; - char addr_pattern[MAX_LINE_LENGTH]; - - ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", - (unsigned long) addr); - if (ret >= MAX_LINE_LENGTH) { - printf("%s: Pattern is too long\n", __func__); + if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE, + finfo.dir) >= sizeof(finfo.path)) { + printf("%s: Pathname is too long\n", __func__); exit(EXIT_FAILURE); } - - - fp = fopen(PID_SMAPS, "r"); - if (!fp) { - printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); + if (statfs(finfo.dir, &fs)) { + perror("statfs()"); exit(EXIT_FAILURE); } - if (!check_for_pattern(fp, addr_pattern, buffer)) - goto err_out; - - ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "AnonHugePages:%10ld kB", - hpage_pmd_size >> 10); - if (ret >= MAX_LINE_LENGTH) { - printf("%s: Pattern is too long\n", __func__); + finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE; + if (finfo.type == VMA_SHMEM) + return; + + /* Find owning device's queue/read_ahead_kb control */ + if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent", + major(path_stat.st_dev), minor(path_stat.st_dev)) + >= sizeof(path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + if (read_file(path, buf, sizeof(buf)) < 0) { + perror("read_file(read_num)"); + exit(EXIT_FAILURE); + } + if (strstr(buf, "DEVTYPE=disk")) { + /* Found it */ + if (snprintf(finfo.dev_queue_read_ahead_path, + sizeof(finfo.dev_queue_read_ahead_path), + "/sys/dev/block/%d:%d/queue/read_ahead_kb", + major(path_stat.st_dev), minor(path_stat.st_dev)) + >= sizeof(finfo.dev_queue_read_ahead_path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return; + } + if (!strstr(buf, "DEVTYPE=partition")) { + printf("%s: Unknown device type: %s\n", __func__, path); exit(EXIT_FAILURE); } /* - * Fetch the AnonHugePages: in the same block and check whether it got - * the expected number of hugeepages next. + * Partition of block device - need to find actual device. + * Using naming convention that devnameN is partition of + * device devname. 
*/ - if (!check_for_pattern(fp, "AnonHugePages:", buffer)) - goto err_out; - - if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) - goto err_out; - - thp = true; -err_out: - fclose(fp); - return thp; + str = strstr(buf, "DEVNAME="); + if (!str) { + printf("%s: Could not read: %s", __func__, path); + exit(EXIT_FAILURE); + } + str += 8; + end = str; + while (*end) { + if (isdigit(*end)) { + *end = '\0'; + if (snprintf(finfo.dev_queue_read_ahead_path, + sizeof(finfo.dev_queue_read_ahead_path), + "/sys/block/%s/queue/read_ahead_kb", + str) >= sizeof(finfo.dev_queue_read_ahead_path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return; + } + ++end; + } + printf("%s: Could not read: %s\n", __func__, path); + exit(EXIT_FAILURE); } - static bool check_swap(void *addr, unsigned long size) { bool swap = false; @@ -409,7 +535,7 @@ static bool check_swap(void *addr, unsigned long size) printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); exit(EXIT_FAILURE); } - if (!check_for_pattern(fp, addr_pattern, buffer)) + if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) goto err_out; ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB", @@ -422,7 +548,7 @@ static bool check_swap(void *addr, unsigned long size) * Fetch the Swap: in the same block and check whether it got * the expected number of hugeepages next. */ - if (!check_for_pattern(fp, "Swap:", buffer)) + if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer))) goto err_out; if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) @@ -434,12 +560,12 @@ err_out: return swap; } -static void *alloc_mapping(void) +static void *alloc_mapping(int nr) { void *p; - p = mmap(BASE_ADDR, hpage_pmd_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (p != BASE_ADDR) { printf("Failed to allocate VMA at %p\n", BASE_ADDR); exit(EXIT_FAILURE); @@ -456,6 +582,60 @@ static void fill_memory(int *p, unsigned long start, unsigned long end) p[i * page_size / sizeof(*p)] = i + 0xdead0000; } +/* + * MADV_COLLAPSE is a best-effort request and may fail if an internal + * resource is temporarily unavailable, in which case it will set errno to + * EAGAIN. In such a case, immediately reattempt the operation one more + * time. + */ +static int madvise_collapse_retry(void *p, unsigned long size) +{ + bool retry = true; + int ret; + +retry: + ret = madvise(p, size, MADV_COLLAPSE); + if (ret && errno == EAGAIN && retry) { + retry = false; + goto retry; + } + return ret; +} + +/* + * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with + * validate_memory()'able contents. + */ +static void *alloc_hpage(struct mem_ops *ops) +{ + void *p = ops->setup_area(1); + + ops->fault(p, 0, hpage_pmd_size); + + /* + * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE. + * The latter is ineligible for collapse by MADV_COLLAPSE + * while the former might cause MADV_COLLAPSE to race with + * khugepaged on low-load system (like a test machine), which + * would cause MADV_COLLAPSE to fail with EAGAIN. 
+ */ + printf("Allocate huge page..."); + if (madvise_collapse_retry(p, hpage_pmd_size)) { + perror("madvise(MADV_COLLAPSE)"); + exit(EXIT_FAILURE); + } + if (!ops->check_huge(p, 1)) { + perror("madvise(MADV_COLLAPSE)"); + exit(EXIT_FAILURE); + } + if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) { + perror("madvise(MADV_HUGEPAGE)"); + exit(EXIT_FAILURE); + } + success("OK"); + return p; +} + static void validate_memory(int *p, unsigned long start, unsigned long end) { int i; @@ -469,26 +649,216 @@ static void validate_memory(int *p, unsigned long start, unsigned long end) } } +static void *anon_setup_area(int nr_hpages) +{ + return alloc_mapping(nr_hpages); +} + +static void anon_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); +} + +static void anon_fault(void *p, unsigned long start, unsigned long end) +{ + fill_memory(p, start, end); +} + +static bool anon_check_huge(void *addr, int nr_hpages) +{ + return check_huge_anon(addr, nr_hpages, hpage_pmd_size); +} + +static void *file_setup_area(int nr_hpages) +{ + int fd; + void *p; + unsigned long size; + + unlink(finfo.path); /* Cleanup from previous failed tests */ + printf("Creating %s for collapse%s...", finfo.path, + finfo.type == VMA_SHMEM ? " (tmpfs)" : ""); + fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL, + 777); + if (fd < 0) { + perror("open()"); + exit(EXIT_FAILURE); + } + + size = nr_hpages * hpage_pmd_size; + p = alloc_mapping(nr_hpages); + fill_memory(p, 0, size); + write(fd, p, size); + close(fd); + munmap(p, size); + success("OK"); + + printf("Opening %s read only for collapse...", finfo.path); + finfo.fd = open(finfo.path, O_RDONLY, 777); + if (finfo.fd < 0) { + perror("open()"); + exit(EXIT_FAILURE); + } + p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC, + MAP_PRIVATE, finfo.fd, 0); + if (p == MAP_FAILED || p != BASE_ADDR) { + perror("mmap()"); + exit(EXIT_FAILURE); + } + + /* Drop page cache */ + write_file("/proc/sys/vm/drop_caches", "3", 2); + success("OK"); + return p; +} + +static void file_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); + close(finfo.fd); + unlink(finfo.path); +} + +static void file_fault(void *p, unsigned long start, unsigned long end) +{ + if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) { + perror("madvise(MADV_POPULATE_READ"); + exit(EXIT_FAILURE); + } +} + +static bool file_check_huge(void *addr, int nr_hpages) +{ + switch (finfo.type) { + case VMA_FILE: + return check_huge_file(addr, nr_hpages, hpage_pmd_size); + case VMA_SHMEM: + return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); + default: + exit(EXIT_FAILURE); + return false; + } +} + +static void *shmem_setup_area(int nr_hpages) +{ + void *p; + unsigned long size = nr_hpages * hpage_pmd_size; + + finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0); + if (finfo.fd < 0) { + perror("memfd_create()"); + exit(EXIT_FAILURE); + } + if (ftruncate(finfo.fd, size)) { + perror("ftruncate()"); + exit(EXIT_FAILURE); + } + p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd, + 0); + if (p != BASE_ADDR) { + perror("mmap()"); + exit(EXIT_FAILURE); + } + return p; +} + +static void shmem_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); + close(finfo.fd); +} + +static bool shmem_check_huge(void *addr, int nr_hpages) +{ + return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); +} + +static struct mem_ops __anon_ops = { + .setup_area = &anon_setup_area, + .cleanup_area = &anon_cleanup_area, + .fault = &anon_fault, + .check_huge = 
&anon_check_huge, + .name = "anon", +}; + +static struct mem_ops __file_ops = { + .setup_area = &file_setup_area, + .cleanup_area = &file_cleanup_area, + .fault = &file_fault, + .check_huge = &file_check_huge, + .name = "file", +}; + +static struct mem_ops __shmem_ops = { + .setup_area = &shmem_setup_area, + .cleanup_area = &shmem_cleanup_area, + .fault = &anon_fault, + .check_huge = &shmem_check_huge, + .name = "shmem", +}; + +static void __madvise_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) +{ + int ret; + struct settings settings = *current_settings(); + + printf("%s...", msg); + + /* + * Prevent khugepaged interference and tests that MADV_COLLAPSE + * ignores /sys/kernel/mm/transparent_hugepage/enabled + */ + settings.thp_enabled = THP_NEVER; + settings.shmem_enabled = SHMEM_NEVER; + push_settings(&settings); + + /* Clear VM_NOHUGEPAGE */ + madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); + ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size); + if (((bool)ret) == expect) + fail("Fail: Bad return value"); + else if (!ops->check_huge(p, expect ? nr_hpages : 0)) + fail("Fail: check_huge()"); + else + success("OK"); + + pop_settings(); +} + +static void madvise_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) +{ + /* Sanity check */ + if (!ops->check_huge(p, 0)) { + printf("Unexpected huge page\n"); + exit(EXIT_FAILURE); + } + __madvise_collapse(msg, p, nr_hpages, ops, expect); +} + #define TICK 500000 -static bool wait_for_scan(const char *msg, char *p) +static bool wait_for_scan(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops) { int full_scans; int timeout = 6; /* 3 seconds */ /* Sanity check */ - if (check_huge(p)) { + if (!ops->check_huge(p, 0)) { printf("Unexpected huge page\n"); exit(EXIT_FAILURE); } - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); /* Wait until the second full_scan completed */ full_scans = read_num("khugepaged/full_scans") + 2; printf("%s...", msg); while (timeout--) { - if (check_huge(p)) + if (ops->check_huge(p, nr_hpages)) break; if (read_num("khugepaged/full_scans") >= full_scans) break; @@ -496,122 +866,155 @@ static bool wait_for_scan(const char *msg, char *p) usleep(TICK); } - madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE); return timeout == -1; } +static void khugepaged_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) +{ + if (wait_for_scan(msg, p, nr_hpages, ops)) { + if (expect) + fail("Timeout"); + else + success("OK"); + return; + } + + /* + * For file and shmem memory, khugepaged only retracts pte entries after + * putting the new hugepage in the page cache. The hugepage must be + * subsequently refaulted to install the pmd mapping for the mm. + */ + if (ops != &__anon_ops) + ops->fault(p, 0, nr_hpages * hpage_pmd_size); + + if (ops->check_huge(p, expect ? 
nr_hpages : 0)) + success("OK"); + else + fail("Fail"); +} + +static struct collapse_context __khugepaged_context = { + .collapse = &khugepaged_collapse, + .enforce_pte_scan_limits = true, + .name = "khugepaged", +}; + +static struct collapse_context __madvise_context = { + .collapse = &madvise_collapse, + .enforce_pte_scan_limits = false, + .name = "madvise", +}; + +static bool is_tmpfs(struct mem_ops *ops) +{ + return ops == &__file_ops && finfo.type == VMA_SHMEM; +} + static void alloc_at_fault(void) { - struct settings settings = default_settings; + struct settings settings = *current_settings(); char *p; settings.thp_enabled = THP_ALWAYS; - write_settings(&settings); + push_settings(&settings); - p = alloc_mapping(); + p = alloc_mapping(1); *p = 1; printf("Allocate huge page on fault..."); - if (check_huge(p)) + if (check_huge_anon(p, 1, hpage_pmd_size)) success("OK"); else fail("Fail"); - write_settings(&default_settings); + pop_settings(); madvise(p, page_size, MADV_DONTNEED); printf("Split huge PMD on MADV_DONTNEED..."); - if (!check_huge(p)) + if (check_huge_anon(p, 0, hpage_pmd_size)) success("OK"); else fail("Fail"); munmap(p, hpage_pmd_size); } -static void collapse_full(void) +static void collapse_full(struct collapse_context *c, struct mem_ops *ops) { void *p; - - p = alloc_mapping(); - fill_memory(p, 0, hpage_pmd_size); - if (wait_for_scan("Collapse fully populated PTE table", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); - validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + int nr_hpages = 4; + unsigned long size = nr_hpages * hpage_pmd_size; + + p = ops->setup_area(nr_hpages); + ops->fault(p, 0, size); + c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages, + ops, true); + validate_memory(p, 0, size); + ops->cleanup_area(p, size); } -static void collapse_empty(void) +static void collapse_empty(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_mapping(); - if (wait_for_scan("Do not collapse empty PTE table", p)) - fail("Timeout"); - else if (check_huge(p)) - fail("Fail"); - else - success("OK"); - munmap(p, hpage_pmd_size); + p = ops->setup_area(1); + c->collapse("Do not collapse empty PTE table", p, 1, ops, false); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_single_pte_entry(void) +static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_mapping(); - fill_memory(p, 0, page_size); - if (wait_for_scan("Collapse PTE table with single PTE entry present", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); - validate_memory(p, 0, page_size); - munmap(p, hpage_pmd_size); + p = ops->setup_area(1); + ops->fault(p, 0, page_size); + c->collapse("Collapse PTE table with single PTE entry present", p, + 1, ops, true); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_max_ptes_none(void) +static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops) { int max_ptes_none = hpage_pmd_nr / 2; - struct settings settings = default_settings; + struct settings settings = *current_settings(); void *p; settings.khugepaged.max_ptes_none = max_ptes_none; - write_settings(&settings); + push_settings(&settings); - p = alloc_mapping(); + p = ops->setup_area(1); - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); - if (wait_for_scan("Do not collapse with max_ptes_none exceeded", p)) - fail("Timeout"); - else if (check_huge(p)) - fail("Fail"); 
- else - success("OK"); - validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + if (is_tmpfs(ops)) { + /* shmem pages always in the page cache */ + printf("tmpfs..."); + skip("Skip"); + goto skip; + } - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); - if (wait_for_scan("Collapse with max_ptes_none PTEs empty", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); - validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1, + ops, !c->enforce_pte_scan_limits); + validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); - munmap(p, hpage_pmd_size); - write_settings(&default_settings); + if (c->enforce_pte_scan_limits) { + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops, + true); + validate_memory(p, 0, + (hpage_pmd_nr - max_ptes_none) * page_size); + } +skip: + ops->cleanup_area(p, hpage_pmd_size); + pop_settings(); } -static void collapse_swapin_single_pte(void) +static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_mapping(); - fill_memory(p, 0, hpage_pmd_size); + + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); printf("Swapout one page..."); if (madvise(p, page_size, MADV_PAGEOUT)) { @@ -625,25 +1028,21 @@ static void collapse_swapin_single_pte(void) goto out; } - if (wait_for_scan("Collapse with swapping in single PTE entry", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse with swapping in single PTE entry", p, 1, ops, + true); validate_memory(p, 0, hpage_pmd_size); out: - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_max_ptes_swap(void) +static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops) { int max_ptes_swap = read_num("khugepaged/max_ptes_swap"); void *p; - p = alloc_mapping(); + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); - fill_memory(p, 0, hpage_pmd_size); printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) { perror("madvise(MADV_PAGEOUT)"); @@ -656,115 +1055,93 @@ static void collapse_max_ptes_swap(void) goto out; } - if (wait_for_scan("Do not collapse with max_ptes_swap exceeded", p)) - fail("Timeout"); - else if (check_huge(p)) - fail("Fail"); - else - success("OK"); + c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops, + !c->enforce_pte_scan_limits); validate_memory(p, 0, hpage_pmd_size); - fill_memory(p, 0, hpage_pmd_size); - printf("Swapout %d of %d pages...", max_ptes_swap, hpage_pmd_nr); - if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { - perror("madvise(MADV_PAGEOUT)"); - exit(EXIT_FAILURE); - } - if (check_swap(p, max_ptes_swap * page_size)) { - success("OK"); - } else { - fail("Fail"); - goto out; - } + if (c->enforce_pte_scan_limits) { + ops->fault(p, 0, hpage_pmd_size); + printf("Swapout %d of %d pages...", max_ptes_swap, + hpage_pmd_nr); + if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, max_ptes_swap * page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } - if (wait_for_scan("Collapse with max_ptes_swap pages swapped out", p)) - 
fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); - validate_memory(p, 0, hpage_pmd_size); + c->collapse("Collapse with max_ptes_swap pages swapped out", p, + 1, ops, true); + validate_memory(p, 0, hpage_pmd_size); + } out: - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_single_pte_entry_compound(void) +static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_mapping(); + p = alloc_hpage(ops); - printf("Allocate huge page..."); - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p)) - success("OK"); - else - fail("Fail"); - madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + if (is_tmpfs(ops)) { + /* MADV_DONTNEED won't evict tmpfs pages */ + printf("tmpfs..."); + skip("Skip"); + goto skip; + } + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); printf("Split huge page leaving single PTE mapping compound page..."); madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); - if (!check_huge(p)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); - if (wait_for_scan("Collapse PTE table with single PTE mapping compound page", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table with single PTE mapping compound page", + p, 1, ops, true); validate_memory(p, 0, page_size); - munmap(p, hpage_pmd_size); +skip: + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_full_of_compound(void) +static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_mapping(); - - printf("Allocate huge page..."); - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p)) - success("OK"); - else - fail("Fail"); - + p = alloc_hpage(ops); printf("Split huge page leaving single PTE page table full of compound pages..."); madvise(p, page_size, MADV_NOHUGEPAGE); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - if (!check_huge(p)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); - if (wait_for_scan("Collapse PTE table full of compound pages", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table full of compound pages", p, 1, ops, + true); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_compound_extreme(void) +static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops) { void *p; int i; - p = alloc_mapping(); + p = ops->setup_area(1); for (i = 0; i < hpage_pmd_nr; i++) { printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...", i + 1, hpage_pmd_nr); madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(BASE_ADDR, 0, hpage_pmd_size); - if (!check_huge(BASE_ADDR)) { + ops->fault(BASE_ADDR, 0, hpage_pmd_size); + if (!ops->check_huge(BASE_ADDR, 1)) { printf("Failed to allocate huge page\n"); exit(EXIT_FAILURE); } @@ -791,34 +1168,30 @@ static void collapse_compound_extreme(void) } } - munmap(BASE_ADDR, hpage_pmd_size); - fill_memory(p, 0, hpage_pmd_size); - if (!check_huge(p)) + ops->cleanup_area(BASE_ADDR, hpage_pmd_size); + ops->fault(p, 0, hpage_pmd_size); + if (!ops->check_huge(p, 1)) success("OK"); else fail("Fail"); - if (wait_for_scan("Collapse PTE table full of different compound pages", p)) - fail("Timeout"); - else if 
(check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table full of different compound pages", p, 1, + ops, true); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_fork(void) +static void collapse_fork(struct collapse_context *c, struct mem_ops *ops) { int wstatus; void *p; - p = alloc_mapping(); + p = ops->setup_area(1); printf("Allocate small page..."); - fill_memory(p, 0, page_size); - if (!check_huge(p)) + ops->fault(p, 0, page_size); + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); @@ -829,22 +1202,17 @@ static void collapse_fork(void) skip_settings_restore = true; exit_status = 0; - if (!check_huge(p)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); - fill_memory(p, page_size, 2 * page_size); - - if (wait_for_scan("Collapse PTE table with single page shared with parent process", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + ops->fault(p, page_size, 2 * page_size); + c->collapse("Collapse PTE table with single page shared with parent process", + p, 1, ops, true); validate_memory(p, 0, page_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); exit(exit_status); } @@ -852,36 +1220,27 @@ static void collapse_fork(void) exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has small page..."); - if (!check_huge(p)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); validate_memory(p, 0, page_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_fork_compound(void) +static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops) { int wstatus; void *p; - p = alloc_mapping(); - - printf("Allocate huge page..."); - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p)) - success("OK"); - else - fail("Fail"); - + p = alloc_hpage(ops); printf("Share huge page over fork()..."); if (!fork()) { /* Do not touch settings on child exit */ skip_settings_restore = true; exit_status = 0; - if (check_huge(p)) + if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); @@ -889,24 +1248,20 @@ static void collapse_fork_compound(void) printf("Split huge page PMD in child process..."); madvise(p, page_size, MADV_NOHUGEPAGE); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - if (!check_huge(p)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); - fill_memory(p, 0, page_size); + ops->fault(p, 0, page_size); write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); - if (wait_for_scan("Collapse PTE table full of compound pages in child", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table full of compound pages in child", + p, 1, ops, true); write_num("khugepaged/max_ptes_shared", - default_settings.khugepaged.max_ptes_shared); + current_settings()->khugepaged.max_ptes_shared); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); exit(exit_status); } @@ -914,74 +1269,59 @@ static void collapse_fork_compound(void) exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has huge page..."); - if (check_huge(p)) + if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_max_ptes_shared() 
+static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops) { int max_ptes_shared = read_num("khugepaged/max_ptes_shared"); int wstatus; void *p; - p = alloc_mapping(); - - printf("Allocate huge page..."); - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p)) - success("OK"); - else - fail("Fail"); - + p = alloc_hpage(ops); printf("Share huge page over fork()..."); if (!fork()) { /* Do not touch settings on child exit */ skip_settings_restore = true; exit_status = 0; - if (check_huge(p)) + if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); printf("Trigger CoW on page %d of %d...", hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); - if (!check_huge(p)) - success("OK"); - else - fail("Fail"); - - if (wait_for_scan("Do not collapse with max_ptes_shared exceeded", p)) - fail("Timeout"); - else if (!check_huge(p)) - success("OK"); - else - fail("Fail"); - - printf("Trigger CoW on page %d of %d...", - hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size); - if (!check_huge(p)) + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); - - if (wait_for_scan("Collapse with max_ptes_shared PTEs shared", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Maybe collapse with max_ptes_shared exceeded", p, + 1, ops, !c->enforce_pte_scan_limits); + + if (c->enforce_pte_scan_limits) { + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) * + page_size); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + c->collapse("Collapse with max_ptes_shared PTEs shared", + p, 1, ops, true); + } validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); exit(exit_status); } @@ -989,20 +1329,153 @@ static void collapse_max_ptes_shared() exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has huge page..."); - if (check_huge(p)) + if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -int main(void) +static void madvise_collapse_existing_thps(struct collapse_context *c, + struct mem_ops *ops) { + void *p; + + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); + c->collapse("Collapse fully populated PTE table...", p, 1, ops, true); + validate_memory(p, 0, hpage_pmd_size); + + /* c->collapse() will find a hugepage and complain - call directly. */ + __madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true); + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +/* + * Test race with khugepaged where page tables have been retracted and + * pmd cleared. 
+ */ +static void madvise_retracted_page_tables(struct collapse_context *c, + struct mem_ops *ops) +{ + void *p; + int nr_hpages = 1; + unsigned long size = nr_hpages * hpage_pmd_size; + + p = ops->setup_area(nr_hpages); + ops->fault(p, 0, size); + + /* Let khugepaged collapse and leave pmd cleared */ + if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages, + ops)) { + fail("Timeout"); + return; + } + success("OK"); + c->collapse("Install huge PMD from page cache", p, nr_hpages, ops, + true); + validate_memory(p, 0, size); + ops->cleanup_area(p, size); +} + +static void usage(void) +{ + fprintf(stderr, "\nUsage: ./khugepaged <test type> [dir]\n\n"); + fprintf(stderr, "\t<test type>\t: <context>:<mem_type>\n"); + fprintf(stderr, "\t<context>\t: [all|khugepaged|madvise]\n"); + fprintf(stderr, "\t<mem_type>\t: [all|anon|file|shmem]\n"); + fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n"); + fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n"); + fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"); + fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n"); + fprintf(stderr, "\tmounted with huge=madvise option for khugepaged tests to work\n"); + exit(1); +} + +static void parse_test_type(int argc, const char **argv) +{ + char *buf; + const char *token; + + if (argc == 1) { + /* Backwards compatibility */ + khugepaged_context = &__khugepaged_context; + madvise_context = &__madvise_context; + anon_ops = &__anon_ops; + return; + } + + buf = strdup(argv[1]); + token = strsep(&buf, ":"); + + if (!strcmp(token, "all")) { + khugepaged_context = &__khugepaged_context; + madvise_context = &__madvise_context; + } else if (!strcmp(token, "khugepaged")) { + khugepaged_context = &__khugepaged_context; + } else if (!strcmp(token, "madvise")) { + madvise_context = &__madvise_context; + } else { + usage(); + } + + if (!buf) + usage(); + + if (!strcmp(buf, "all")) { + file_ops = &__file_ops; + anon_ops = &__anon_ops; + shmem_ops = &__shmem_ops; + } else if (!strcmp(buf, "anon")) { + anon_ops = &__anon_ops; + } else if (!strcmp(buf, "file")) { + file_ops = &__file_ops; + } else if (!strcmp(buf, "shmem")) { + shmem_ops = &__shmem_ops; + } else { + usage(); + } + + if (!file_ops) + return; + + if (argc != 3) + usage(); +} + +int main(int argc, const char **argv) +{ + struct settings default_settings = { + .thp_enabled = THP_MADVISE, + .thp_defrag = THP_DEFRAG_ALWAYS, + .shmem_enabled = SHMEM_ADVISE, + .use_zero_page = 0, + .khugepaged = { + .defrag = 1, + .alloc_sleep_millisecs = 10, + .scan_sleep_millisecs = 10, + }, + /* + * When testing file-backed memory, the collapse path + * looks at how many pages are found in the page cache, not + * what pages are mapped. Disable read ahead optimization so + * pages don't find their way into the page cache unless + * we mem_ops->fault() them in. 
+ */ + .read_ahead_kb = 0, + }; + + parse_test_type(argc, argv); + + if (file_ops) + get_finfo(argv[2]); + setbuf(stdout, NULL); page_size = getpagesize(); - hpage_pmd_size = read_num("hpage_pmd_size"); + hpage_pmd_size = read_pmd_pagesize(); hpage_pmd_nr = hpage_pmd_size / page_size; default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1; @@ -1011,21 +1484,75 @@ int main(void) default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; save_settings(); - adjust_settings(); + push_settings(&default_settings); alloc_at_fault(); - collapse_full(); - collapse_empty(); - collapse_single_pte_entry(); - collapse_max_ptes_none(); - collapse_swapin_single_pte(); - collapse_max_ptes_swap(); - collapse_single_pte_entry_compound(); - collapse_full_of_compound(); - collapse_compound_extreme(); - collapse_fork(); - collapse_fork_compound(); - collapse_max_ptes_shared(); + +#define TEST(t, c, o) do { \ + if (c && o) { \ + printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \ + t(c, o); \ + } \ + } while (0) + + TEST(collapse_full, khugepaged_context, anon_ops); + TEST(collapse_full, khugepaged_context, file_ops); + TEST(collapse_full, khugepaged_context, shmem_ops); + TEST(collapse_full, madvise_context, anon_ops); + TEST(collapse_full, madvise_context, file_ops); + TEST(collapse_full, madvise_context, shmem_ops); + + TEST(collapse_empty, khugepaged_context, anon_ops); + TEST(collapse_empty, madvise_context, anon_ops); + + TEST(collapse_single_pte_entry, khugepaged_context, anon_ops); + TEST(collapse_single_pte_entry, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops); + TEST(collapse_single_pte_entry, madvise_context, anon_ops); + TEST(collapse_single_pte_entry, madvise_context, file_ops); + TEST(collapse_single_pte_entry, madvise_context, shmem_ops); + + TEST(collapse_max_ptes_none, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_none, khugepaged_context, file_ops); + TEST(collapse_max_ptes_none, madvise_context, anon_ops); + TEST(collapse_max_ptes_none, madvise_context, file_ops); + + TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops); + TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops); + TEST(collapse_single_pte_entry_compound, madvise_context, file_ops); + + TEST(collapse_full_of_compound, khugepaged_context, anon_ops); + TEST(collapse_full_of_compound, khugepaged_context, file_ops); + TEST(collapse_full_of_compound, khugepaged_context, shmem_ops); + TEST(collapse_full_of_compound, madvise_context, anon_ops); + TEST(collapse_full_of_compound, madvise_context, file_ops); + TEST(collapse_full_of_compound, madvise_context, shmem_ops); + + TEST(collapse_compound_extreme, khugepaged_context, anon_ops); + TEST(collapse_compound_extreme, madvise_context, anon_ops); + + TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops); + TEST(collapse_swapin_single_pte, madvise_context, anon_ops); + + TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_swap, madvise_context, anon_ops); + + TEST(collapse_fork, khugepaged_context, anon_ops); + TEST(collapse_fork, madvise_context, anon_ops); + + TEST(collapse_fork_compound, khugepaged_context, anon_ops); + TEST(collapse_fork_compound, madvise_context, anon_ops); + + TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_shared, madvise_context, anon_ops); + + TEST(madvise_collapse_existing_thps, madvise_context, 
anon_ops); + TEST(madvise_collapse_existing_thps, madvise_context, file_ops); + TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops); + + TEST(madvise_retracted_page_tables, madvise_context, file_ops); + TEST(madvise_retracted_page_tables, madvise_context, shmem_ops); restore_settings(0); } diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c index db0270127aeb..9496346973d4 100644 --- a/tools/testing/selftests/vm/mremap_test.c +++ b/tools/testing/selftests/vm/mremap_test.c @@ -119,6 +119,50 @@ static unsigned long long get_mmap_min_addr(void) } /* + * This test validates that merge is called when expanding a mapping. + * Mapping containing three pages is created, middle page is unmapped + * and then the mapping containing the first page is expanded so that + * it fills the created hole. The two parts should merge creating + * single mapping with three pages. + */ +static void mremap_expand_merge(unsigned long page_size) +{ + char *test_name = "mremap expand merge"; + FILE *fp; + char *line = NULL; + size_t len = 0; + bool success = false; + char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + munmap(start + page_size, page_size); + mremap(start, page_size, 2 * page_size, 0); + + fp = fopen("/proc/self/maps", "r"); + if (fp == NULL) { + ksft_test_result_fail("%s\n", test_name); + return; + } + + while (getline(&line, &len, fp) != -1) { + char *first = strtok(line, "- "); + void *first_val = (void *)strtol(first, NULL, 16); + char *second = strtok(NULL, "- "); + void *second_val = (void *) strtol(second, NULL, 16); + + if (first_val == start && second_val == start + 3 * page_size) { + success = true; + break; + } + } + if (success) + ksft_test_result_pass("%s\n", test_name); + else + ksft_test_result_fail("%s\n", test_name); + fclose(fp); +} + +/* * Returns the start address of the mapping on success, else returns * NULL on failure. */ @@ -336,6 +380,7 @@ int main(int argc, char **argv) int i, run_perf_tests; unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; unsigned int pattern_seed; + int num_expand_tests = 1; struct test test_cases[MAX_TEST]; struct test perf_test_cases[MAX_PERF_TEST]; int page_size; @@ -407,12 +452,14 @@ int main(int argc, char **argv) (threshold_mb * _1MB >= _1GB); ksft_set_plan(ARRAY_SIZE(test_cases) + (run_perf_tests ? - ARRAY_SIZE(perf_test_cases) : 0)); + ARRAY_SIZE(perf_test_cases) : 0) + num_expand_tests); for (i = 0; i < ARRAY_SIZE(test_cases); i++) run_mremap_test_case(test_cases[i], &failures, threshold_mb, pattern_seed); + mremap_expand_merge(page_size); + if (run_perf_tests) { ksft_print_msg("\n%s\n", "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:"); diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index de86983b8a0f..e780e76c26b8 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -120,11 +120,16 @@ run_test ./gup_test -a # Dump pages 0, 19, and 4096, using pin_user_pages: run_test ./gup_test -ct -F 0x1 0 19 0x1000 -run_test ./userfaultfd anon 20 16 -# Test requires source and destination huge pages. Size of source -# (half_ufd_size_MB) is passed as argument to test. -run_test ./userfaultfd hugetlb "$half_ufd_size_MB" 32 -run_test ./userfaultfd shmem 20 16 +uffd_mods=("" ":dev") +for mod in "${uffd_mods[@]}"; do + run_test ./userfaultfd anon${mod} 20 16 + # Hugetlb tests require source and destination huge pages. 
Pass in half + # the size ($half_ufd_size_MB), which is used for *each*. + run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32 + run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32 "$mnt"/uffd-test + rm -f "$mnt"/uffd-test + run_test ./userfaultfd shmem${mod} 20 16 +done #cleanup umount "$mnt" diff --git a/tools/testing/selftests/vm/soft-dirty.c b/tools/testing/selftests/vm/soft-dirty.c index e3a43f5d4fa2..21d8830c5f24 100644 --- a/tools/testing/selftests/vm/soft-dirty.c +++ b/tools/testing/selftests/vm/soft-dirty.c @@ -91,7 +91,7 @@ static void test_hugepage(int pagemap_fd, int pagesize) for (i = 0; i < hpage_len; i++) map[i] = (char)i; - if (check_huge(map)) { + if (check_huge_anon(map, 1, hpage_len)) { ksft_test_result_pass("Test %s huge page allocation\n", __func__); clear_softdirty(); diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c index 6aa2b8253aed..76e1c36dd9e5 100644 --- a/tools/testing/selftests/vm/split_huge_page_test.c +++ b/tools/testing/selftests/vm/split_huge_page_test.c @@ -92,7 +92,6 @@ void split_pmd_thp(void) { char *one_page; size_t len = 4 * pmd_pagesize; - uint64_t thp_size; size_t i; one_page = memalign(pmd_pagesize, len); @@ -107,8 +106,7 @@ void split_pmd_thp(void) for (i = 0; i < len; i++) one_page[i] = (char)i; - thp_size = check_huge(one_page); - if (!thp_size) { + if (!check_huge_anon(one_page, 1, pmd_pagesize)) { printf("No THP is allocated\n"); exit(EXIT_FAILURE); } @@ -124,9 +122,8 @@ void split_pmd_thp(void) } - thp_size = check_huge(one_page); - if (thp_size) { - printf("Still %ld kB AnonHugePages not split\n", thp_size); + if (check_huge_anon(one_page, 0, pmd_pagesize)) { + printf("Still AnonHugePages not split\n"); exit(EXIT_FAILURE); } @@ -172,8 +169,7 @@ void split_pte_mapped_thp(void) for (i = 0; i < len; i++) one_page[i] = (char)i; - thp_size = check_huge(one_page); - if (!thp_size) { + if (!check_huge_anon(one_page, 1, pmd_pagesize)) { printf("No THP is allocated\n"); exit(EXIT_FAILURE); } diff --git a/tools/testing/selftests/vm/test_hmm.sh b/tools/testing/selftests/vm/test_hmm.sh index 539c9371e592..46e19b5d648d 100755 --- a/tools/testing/selftests/vm/test_hmm.sh +++ b/tools/testing/selftests/vm/test_hmm.sh @@ -52,21 +52,11 @@ load_driver() usage fi fi - if [ $? == 0 ]; then - major=$(awk "\$2==\"HMM_DMIRROR\" {print \$1}" /proc/devices) - mknod /dev/hmm_dmirror0 c $major 0 - mknod /dev/hmm_dmirror1 c $major 1 - if [ $# -eq 2 ]; then - mknod /dev/hmm_dmirror2 c $major 2 - mknod /dev/hmm_dmirror3 c $major 3 - fi - fi } unload_driver() { modprobe -r $DRIVER > /dev/null 2>&1 - rm -f /dev/hmm_dmirror? 
} run_smoke() diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 7c3f1b0ab468..297f250c1d95 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -61,10 +61,11 @@ #include <sys/random.h> #include "../kselftest.h" +#include "vm_util.h" #ifdef __NR_userfaultfd -static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; +static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; #define BOUNCE_RANDOM (1<<0) #define BOUNCE_RACINGFAULTS (1<<1) @@ -77,6 +78,13 @@ static int bounces; #define TEST_SHMEM 3 static int test_type; +#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) + +#define BASE_PMD_ADDR ((void *)(1UL << 30)) + +/* test using /dev/userfaultfd, instead of userfaultfd(2) */ +static bool test_dev_userfaultfd; + /* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */ #define ALARM_INTERVAL_SECS 10 static volatile bool test_uffdio_copy_eexist = true; @@ -92,9 +100,10 @@ static int huge_fd; static unsigned long long *count_verify; static int uffd = -1; static int uffd_flags, finished, *pipefd; -static char *area_src, *area_src_alias, *area_dst, *area_dst_alias; +static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; static char *zeropage; pthread_attr_t attr; +static bool test_collapse; /* Userfaultfd test statistics */ struct uffd_stats { @@ -122,9 +131,13 @@ struct uffd_stats { #define swap(a, b) \ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) +#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1))) + const char *examples = "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" "./userfaultfd anon 100 99999\n\n" + "# Run the same anonymous memory test, but using /dev/userfaultfd:\n" + "./userfaultfd anon:dev 100 99999\n\n" "# Run shared memory test on 1GiB region with 99 bounces:\n" "./userfaultfd shmem 1000 99\n\n" "# Run hugetlb memory test on 256MiB region with 50 bounces:\n" @@ -141,6 +154,16 @@ static void usage(void) "[hugetlbfs_file]\n\n"); fprintf(stderr, "Supported <test type>: anon, hugetlb, " "hugetlb_shared, shmem\n\n"); + fprintf(stderr, "'Test mods' can be joined to the test type string with a ':'. " "Supported mods:\n"); + fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n"); + fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n"); + fprintf(stderr, "\tcollapse - Test MADV_COLLAPSE of UFFDIO_REGISTER_MODE_MINOR " "memory\n"); + fprintf(stderr, "\nExample test mod usage:\n"); + fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n"); + fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n"); + fprintf(stderr, "Examples:\n\n"); fprintf(stderr, "%s", examples); exit(1); @@ -154,12 +177,14 @@ static void usage(void) ret, __LINE__); \ } while (0) -#define err(fmt, ...) \ +#define errexit(exitcode, fmt, ...) \ do { \ _err(fmt, ##__VA_ARGS__); \ - exit(1); \ + exit(exitcode); \ } while (0) +#define err(fmt, ...) 
errexit(1, fmt, ##__VA_ARGS__) + static void uffd_stats_reset(struct uffd_stats *uffd_stats, unsigned long n_cpus) { @@ -212,12 +237,10 @@ static void anon_release_pages(char *rel_area) err("madvise(MADV_DONTNEED) failed"); } -static void anon_allocate_area(void **alloc_area) +static void anon_allocate_area(void **alloc_area, bool is_src) { *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (*alloc_area == MAP_FAILED) - err("mmap of anonymous memory failed"); } static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) @@ -235,7 +258,7 @@ static void hugetlb_release_pages(char *rel_area) } } -static void hugetlb_allocate_area(void **alloc_area) +static void hugetlb_allocate_area(void **alloc_area, bool is_src) { void *area_alias = NULL; char **alloc_area_alias; @@ -245,7 +268,7 @@ static void hugetlb_allocate_area(void **alloc_area) nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | - (*alloc_area == area_src ? 0 : MAP_NORESERVE), + (is_src ? 0 : MAP_NORESERVE), -1, 0); else @@ -253,9 +276,9 @@ static void hugetlb_allocate_area(void **alloc_area) nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_SHARED | - (*alloc_area == area_src ? 0 : MAP_NORESERVE), + (is_src ? 0 : MAP_NORESERVE), huge_fd, - *alloc_area == area_src ? 0 : nr_pages * page_size); + is_src ? 0 : nr_pages * page_size); if (*alloc_area == MAP_FAILED) err("mmap of hugetlbfs file failed"); @@ -265,12 +288,12 @@ static void hugetlb_allocate_area(void **alloc_area) PROT_READ | PROT_WRITE, MAP_SHARED, huge_fd, - *alloc_area == area_src ? 0 : nr_pages * page_size); + is_src ? 0 : nr_pages * page_size); if (area_alias == MAP_FAILED) err("mmap of hugetlb file alias failed"); } - if (*alloc_area == area_src) { + if (is_src) { alloc_area_alias = &area_src_alias; } else { alloc_area_alias = &area_dst_alias; @@ -293,21 +316,36 @@ static void shmem_release_pages(char *rel_area) err("madvise(MADV_REMOVE) failed"); } -static void shmem_allocate_area(void **alloc_area) +static void shmem_allocate_area(void **alloc_area, bool is_src) { void *area_alias = NULL; - bool is_src = alloc_area == (void **)&area_src; - unsigned long offset = is_src ? 0 : nr_pages * page_size; + size_t bytes = nr_pages * page_size; + unsigned long offset = is_src ? 
0 : bytes; + char *p = NULL, *p_alias = NULL; + + if (test_collapse) { + p = BASE_PMD_ADDR; + if (!is_src) + /* src map + alias + interleaved hpages */ + p += 2 * (bytes + hpage_size); + p_alias = p; + p_alias += bytes; + p_alias += hpage_size; /* Prevent src/dst VMA merge */ + } - *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - MAP_SHARED, shm_fd, offset); + *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + shm_fd, offset); if (*alloc_area == MAP_FAILED) err("mmap of memfd failed"); + if (test_collapse && *alloc_area != p) + err("mmap of memfd failed at %p", p); - area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - MAP_SHARED, shm_fd, offset); + area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + shm_fd, offset); if (area_alias == MAP_FAILED) err("mmap of memfd alias failed"); + if (test_collapse && area_alias != p_alias) + err("mmap of memfd alias failed at %p", p_alias); if (is_src) area_src_alias = area_alias; @@ -320,28 +358,39 @@ static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) *start = (unsigned long)area_dst_alias + offset; } +static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) +{ + if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size)) + err("Did not find expected %d number of hugepages", + expect_nr_hpages); +} + struct uffd_test_ops { - void (*allocate_area)(void **alloc_area); + void (*allocate_area)(void **alloc_area, bool is_src); void (*release_pages)(char *rel_area); void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); + void (*check_pmd_mapping)(void *p, int expect_nr_hpages); }; static struct uffd_test_ops anon_uffd_test_ops = { .allocate_area = anon_allocate_area, .release_pages = anon_release_pages, .alias_mapping = noop_alias_mapping, + .check_pmd_mapping = NULL, }; static struct uffd_test_ops shmem_uffd_test_ops = { .allocate_area = shmem_allocate_area, .release_pages = shmem_release_pages, .alias_mapping = shmem_alias_mapping, + .check_pmd_mapping = shmem_check_pmd_mapping, }; static struct uffd_test_ops hugetlb_uffd_test_ops = { .allocate_area = hugetlb_allocate_area, .release_pages = hugetlb_release_pages, .alias_mapping = hugetlb_alias_mapping, + .check_pmd_mapping = NULL, }; static struct uffd_test_ops *uffd_test_ops; @@ -383,13 +432,34 @@ static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls) } } +static int __userfaultfd_open_dev(void) +{ + int fd, _uffd; + + fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); + if (fd < 0) + errexit(KSFT_SKIP, "opening /dev/userfaultfd failed"); + + _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS); + if (_uffd < 0) + errexit(errno == ENOTTY ? KSFT_SKIP : 1, + "creating userfaultfd failed"); + close(fd); + return _uffd; +} + static void userfaultfd_open(uint64_t *features) { struct uffdio_api uffdio_api; - uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); - if (uffd < 0) - err("userfaultfd syscall not available in this kernel"); + if (test_dev_userfaultfd) + uffd = __userfaultfd_open_dev(); + else { + uffd = syscall(__NR_userfaultfd, UFFD_FLAGS); + if (uffd < 0) + errexit(errno == ENOSYS ? 
KSFT_SKIP : 1, + "creating userfaultfd failed"); + } uffd_flags = fcntl(uffd, F_GETFD, NULL); uffdio_api.api = UFFD_API; @@ -440,6 +510,7 @@ static void uffd_test_ctx_clear(void) munmap_area((void **)&area_src_alias); munmap_area((void **)&area_dst); munmap_area((void **)&area_dst_alias); + munmap_area((void **)&area_remap); } static void uffd_test_ctx_init(uint64_t features) { @@ -448,8 +519,8 @@ uffd_test_ctx_clear(); - uffd_test_ops->allocate_area((void **)&area_src); - uffd_test_ops->allocate_area((void **)&area_dst); + uffd_test_ops->allocate_area((void **)&area_src, true); + uffd_test_ops->allocate_area((void **)&area_dst, false); userfaultfd_open(&features); @@ -703,7 +774,27 @@ static void uffd_handle_page_fault(struct uffd_msg *msg, continue_range(uffd, msg->arg.pagefault.address, page_size); stats->minor_faults++; } else { - /* Missing page faults */ + /* + * Missing page faults. + * + * Here we force a write check for each of the missing-mode + * faults. This is guaranteed to hold because the only threads + * that trigger uffd faults are the locking threads, and their + * first instruction touching the missing page is always + * pthread_mutex_lock(). + * + * Note that this relies on an NPTL glibc implementation + * detail: the lock type is read at the entry of the lock op + * (pthread_mutex_t.__data.__type, offset 0x10) before any + * locking operation is performed. Relying on this detail is + * not ideal, because a pthread-compatible library could + * logically implement its locks without types, and linking + * with such a library would make this check fail. However, + * since this strict check has caught real bugs before, we + * keep it around as a useful hint for when it fails again. + * If it one day breaks on some other libc implementation, + * we will revisit. + */ if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) err("unexpected write fault"); @@ -766,6 +857,7 @@ static void *uffd_poll_thread(void *arg) err("remove failure"); break; case UFFD_EVENT_REMAP: + area_remap = area_dst; /* save for later unmap */ area_dst = (char *)(unsigned long)msg.arg.remap.to; break; } @@ -1218,13 +1310,30 @@ static int userfaultfd_sig_test(void) return userfaults != 0; } +void check_memory_contents(char *p) +{ + unsigned long i; + uint8_t expected_byte; + void *expected_page; + + if (posix_memalign(&expected_page, page_size, page_size)) + err("out of memory"); + + for (i = 0; i < nr_pages; ++i) { + expected_byte = ~((uint8_t)(i % ((uint8_t)-1))); + memset(expected_page, expected_byte, page_size); + if (my_bcmp(expected_page, p + (i * page_size), page_size)) + err("unexpected page contents after minor fault"); + } + + free(expected_page); +} + static int userfaultfd_minor_test(void) { - struct uffdio_register uffdio_register; unsigned long p; + struct uffdio_register uffdio_register; pthread_t uffd_mon; - uint8_t expected_byte; - void *expected_page; char c; struct uffd_stats stats = { 0 }; @@ -1263,17 +1372,7 @@ static int userfaultfd_minor_test(void) * fault. uffd_poll_thread will resolve the fault by bit-flipping the * page's contents, and then issuing a CONTINUE ioctl. 
*/ - - if (posix_memalign(&expected_page, page_size, page_size)) - err("out of memory"); - - for (p = 0; p < nr_pages; ++p) { - expected_byte = ~((uint8_t)(p % ((uint8_t)-1))); - memset(expected_page, expected_byte, page_size); - if (my_bcmp(expected_page, area_dst_alias + (p * page_size), - page_size)) - err("unexpected page contents after minor fault"); - } + check_memory_contents(area_dst_alias); if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) err("pipe write"); @@ -1282,6 +1381,23 @@ static int userfaultfd_minor_test(void) uffd_stats_report(&stats, 1); + if (test_collapse) { + printf("testing collapse of uffd memory into PMD-mapped THPs:"); + if (madvise(area_dst_alias, nr_pages * page_size, + MADV_COLLAPSE)) + err("madvise(MADV_COLLAPSE)"); + + uffd_test_ops->check_pmd_mapping(area_dst, + nr_pages * page_size / + hpage_size); + /* + * This won't cause an uffd fault - it just makes sure there + * was no corruption. + */ + check_memory_contents(area_dst_alias); + printf(" done.\n"); + } + return stats.missing_faults != 0 || stats.minor_faults != nr_pages; } @@ -1584,8 +1700,6 @@ unsigned long default_huge_page_size(void) static void set_test_type(const char *type) { - uint64_t features = UFFD_API_FEATURES; - if (!strcmp(type, "anon")) { test_type = TEST_ANON; uffd_test_ops = &anon_uffd_test_ops; @@ -1603,12 +1717,37 @@ static void set_test_type(const char *type) test_type = TEST_SHMEM; uffd_test_ops = &shmem_uffd_test_ops; test_uffdio_minor = true; - } else { - err("Unknown test type: %s", type); } +} + +static void parse_test_type_arg(const char *raw_type) +{ + char *buf = strdup(raw_type); + uint64_t features = UFFD_API_FEATURES; + + while (buf) { + const char *token = strsep(&buf, ":"); + + if (!test_type) + set_test_type(token); + else if (!strcmp(token, "dev")) + test_dev_userfaultfd = true; + else if (!strcmp(token, "syscall")) + test_dev_userfaultfd = false; + else if (!strcmp(token, "collapse")) + test_collapse = true; + else + err("unrecognized test mod '%s'", token); + } + + if (!test_type) + err("failed to parse test type argument: '%s'", raw_type); + + if (test_collapse && test_type != TEST_SHMEM) + err("Unsupported test: %s", raw_type); if (test_type == TEST_HUGETLB) - page_size = default_huge_page_size(); + page_size = hpage_size; else page_size = sysconf(_SC_PAGE_SIZE); @@ -1646,6 +1785,8 @@ static void sigalrm(int sig) int main(int argc, char **argv) { + size_t bytes; + if (argc < 4) usage(); @@ -1653,11 +1794,41 @@ int main(int argc, char **argv) err("failed to arm SIGALRM"); alarm(ALARM_INTERVAL_SECS); - set_test_type(argv[1]); + hpage_size = default_huge_page_size(); + parse_test_type_arg(argv[1]); + bytes = atol(argv[2]) * 1024 * 1024; + + if (test_collapse && bytes & (hpage_size - 1)) + err("MiB must be multiple of %lu if :collapse mod set", + hpage_size >> 20); nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size / - nr_cpus; + + if (test_collapse) { + /* nr_cpus must divide (bytes / page_size), otherwise, + * area allocations of (nr_pages * page_size) won't be a + * multiple of hpage_size, even if bytes is a multiple of + * hpage_size. + * + * This means that nr_cpus must divide (N * (1 << (H-P))) + * where: + * bytes = hpage_size * N + * hpage_size = 2 << H + * page_size = 2 << P + * + * And we want to choose nr_cpus to be the largest value + * satisfying this constraint, not larger than the number + * of online CPUs. 
Unfortunately, prime factorization of + * N and nr_cpus may be arbitrary, so we would have to search + * for it. Instead, just use the highest power of 2 dividing + * both nr_cpus and (bytes / page_size). + */ + int x = factor_of_2(nr_cpus); + int y = factor_of_2(bytes / page_size); + + nr_cpus = x < y ? x : y; + } + nr_pages_per_cpu = bytes / page_size / nr_cpus; if (!nr_pages_per_cpu) { _err("invalid MiB"); usage(); diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c index b58ab11a7a30..f11f8adda521 100644 --- a/tools/testing/selftests/vm/vm_util.c +++ b/tools/testing/selftests/vm/vm_util.c @@ -42,9 +42,9 @@ void clear_softdirty(void) ksft_exit_fail_msg("writing clear_refs failed\n"); } -static bool check_for_pattern(FILE *fp, const char *pattern, char *buf) +bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len) { - while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) { + while (fgets(buf, len, fp)) { if (!strncmp(buf, pattern, strlen(pattern))) return true; } @@ -72,9 +72,10 @@ uint64_t read_pmd_pagesize(void) return strtoul(buf, NULL, 10); } -uint64_t check_huge(void *addr) +bool __check_huge(void *addr, char *pattern, int nr_hpages, + uint64_t hpage_size) { - uint64_t thp = 0; + uint64_t thp = -1; int ret; FILE *fp; char buffer[MAX_LINE_LENGTH]; @@ -89,20 +90,37 @@ if (!fp) ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, SMAP_FILE_PATH); - if (!check_for_pattern(fp, addr_pattern, buffer)) + if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) goto err_out; /* - * Fetch the AnonHugePages: in the same block and check the number of + * Fetch the pattern in the same block and check the number of * hugepages. */ - if (!check_for_pattern(fp, "AnonHugePages:", buffer)) + if (!check_for_pattern(fp, pattern, buffer, sizeof(buffer))) goto err_out; - if (sscanf(buffer, "AnonHugePages:%10ld kB", &thp) != 1) + snprintf(addr_pattern, MAX_LINE_LENGTH, "%s%%9ld kB", pattern); + + if (sscanf(buffer, addr_pattern, &thp) != 1) ksft_exit_fail_msg("Error reading smaps\n"); err_out: fclose(fp); - return thp; + return thp == (nr_hpages * (hpage_size >> 10)); +} + +bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "AnonHugePages: ", nr_hpages, hpage_size); +} + +bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "FilePmdMapped:", nr_hpages, hpage_size); +} + +bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "ShmemPmdMapped:", nr_hpages, hpage_size); } diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h index 2e512bd57ae1..5c35de454e08 100644 --- a/tools/testing/selftests/vm/vm_util.h +++ b/tools/testing/selftests/vm/vm_util.h @@ -5,5 +5,8 @@ uint64_t pagemap_get_entry(int fd, char *start); bool pagemap_is_softdirty(int fd, char *start); void clear_softdirty(void); +bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); uint64_t read_pmd_pagesize(void); -uint64_t check_huge(void *addr); +bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); +bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); +bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size);
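For reference, the nr_cpus clamping in the userfaultfd main() above can be exercised in isolation. Below is a minimal standalone sketch of the same largest-power-of-two computation; the factor_of_2() trick isolates the lowest set bit, and all concrete values are made up for illustration (this is not part of the patch):

#include <stdio.h>

/* Lowest set bit of x, i.e. the largest power of two dividing x. */
#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1)))

int main(void)
{
	unsigned long nr_cpus = 12;		/* hypothetical online CPU count */
	unsigned long bytes = 128UL << 20;	/* hypothetical 128 MiB test area */
	unsigned long page_size = 4096;
	unsigned long x = factor_of_2(nr_cpus);			/* 12 = 0b1100 -> 4 */
	unsigned long y = factor_of_2(bytes / page_size);	/* 32768 pages -> 32768 */
	unsigned long workers = x < y ? x : y;			/* 4 */

	/* 32768 / 4 = 8192 pages per worker, still a multiple of a 2 MiB hpage */
	printf("use %lu workers, %lu pages each\n",
	       workers, bytes / page_size / workers);
	return 0;
}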
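Likewise, the __check_huge() helpers added to vm_util.c boil down to scanning /proc/<pid>/smaps: locate the line where the VMA starting at addr begins, then read the AnonHugePages/FilePmdMapped/ShmemPmdMapped counter inside that block. A minimal standalone sketch of the same approach follows; anon_huge_kb() and find_pattern() are illustrative names, not part of the selftests:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

static bool find_pattern(FILE *fp, const char *pattern, char *buf, size_t len)
{
	while (fgets(buf, len, fp))
		if (!strncmp(buf, pattern, strlen(pattern)))
			return true;
	return false;
}

/* AnonHugePages (in kB) of the VMA starting at addr, or -1 on error. */
static long anon_huge_kb(void *addr)
{
	char pat[64], buf[256];
	long kb = -1;
	FILE *fp = fopen("/proc/self/smaps", "r");

	if (!fp)
		return -1;
	/* smaps prints each VMA header as "%08lx-%08lx ..." */
	snprintf(pat, sizeof(pat), "%08lx-", (unsigned long)addr);
	if (find_pattern(fp, pat, buf, sizeof(buf)) &&
	    find_pattern(fp, "AnonHugePages:", buf, sizeof(buf)))
		sscanf(buf, "AnonHugePages:%9ld kB", &kb);
	fclose(fp);
	return kb;
}

int main(void)
{
	void *p = mmap(NULL, 4UL << 20, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	printf("AnonHugePages of %p: %ld kB\n", p, anon_huge_kb(p));
	return 0;
}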