diff options
Diffstat (limited to 'tools/testing/selftests')
64 files changed, 4043 insertions, 770 deletions
diff --git a/tools/testing/selftests/cgroup/.gitignore b/tools/testing/selftests/cgroup/.gitignore index 306ee1b01e72..c4a57e69f749 100644 --- a/tools/testing/selftests/cgroup/.gitignore +++ b/tools/testing/selftests/cgroup/.gitignore @@ -5,3 +5,4 @@ test_freezer test_kmem test_kill test_cpu +wait_inotify diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile index 478217cc1371..3d263747d2ad 100644 --- a/tools/testing/selftests/cgroup/Makefile +++ b/tools/testing/selftests/cgroup/Makefile @@ -1,10 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS += -Wall -pthread -all: +all: ${HELPER_PROGS} TEST_FILES := with_stress.sh -TEST_PROGS := test_stress.sh +TEST_PROGS := test_stress.sh test_cpuset_prs.sh +TEST_GEN_FILES := wait_inotify TEST_GEN_PROGS = test_memcontrol TEST_GEN_PROGS += test_kmem TEST_GEN_PROGS += test_core diff --git a/tools/testing/selftests/cgroup/config b/tools/testing/selftests/cgroup/config index 84fe884fad86..97d549ee894f 100644 --- a/tools/testing/selftests/cgroup/config +++ b/tools/testing/selftests/cgroup/config @@ -4,5 +4,4 @@ CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_SCHED=y CONFIG_MEMCG=y CONFIG_MEMCG_KMEM=y -CONFIG_MEMCG_SWAP=y CONFIG_PAGE_COUNTER=y diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh new file mode 100755 index 000000000000..526d2c42d870 --- /dev/null +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -0,0 +1,674 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test for cpuset v2 partition root state (PRS) +# +# The sched verbose flag is set, if available, so that the console log +# can be examined for the correct setting of scheduling domain. +# + +skip_test() { + echo "$1" + echo "Test SKIPPED" + exit 0 +} + +[[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!" + +# Set sched verbose flag, if available +[[ -d /sys/kernel/debug/sched ]] && echo Y > /sys/kernel/debug/sched/verbose + +# Get wait_inotify location +WAIT_INOTIFY=$(cd $(dirname $0); pwd)/wait_inotify + +# Find cgroup v2 mount point +CGROUP2=$(mount -t cgroup2 | head -1 | awk -e '{print $3}') +[[ -n "$CGROUP2" ]] || skip_test "Cgroup v2 mount point not found!" + +CPUS=$(lscpu | grep "^CPU(s)" | sed -e "s/.*:[[:space:]]*//") +[[ $CPUS -lt 8 ]] && skip_test "Test needs at least 8 cpus available!" + +# Set verbose flag and delay factor +PROG=$1 +VERBOSE= +DELAY_FACTOR=1 +while [[ "$1" = -* ]] +do + case "$1" in + -v) VERBOSE=1 + break + ;; + -d) DELAY_FACTOR=$2 + shift + break + ;; + *) echo "Usage: $PROG [-v] [-d <delay-factor>]" + exit + ;; + esac + shift +done + +cd $CGROUP2 +echo +cpuset > cgroup.subtree_control +[[ -d test ]] || mkdir test +cd test + +# Pause in seconds (repeated DELAY_FACTOR times) +pause() +{ + DELAY=$1 + LOOP=0 + while [[ $LOOP -lt $DELAY_FACTOR ]] + do + sleep $DELAY + ((LOOP++)) + done + return 0 +} + +console_msg() +{ + MSG=$1 + echo "$MSG" + echo "" > /dev/console + echo "$MSG" > /dev/console + pause 0.01 +} + +test_partition() +{ + EXPECTED_VAL=$1 + echo $EXPECTED_VAL > cpuset.cpus.partition + [[ $?
-eq 0 ]] || exit 1 + ACTUAL_VAL=$(cat cpuset.cpus.partition) + [[ $ACTUAL_VAL != $EXPECTED_VAL ]] && { + echo "cpuset.cpus.partition: expect $EXPECTED_VAL, found $ACTUAL_VAL" + echo "Test FAILED" + exit 1 + } +} + +test_effective_cpus() +{ + EXPECTED_VAL=$1 + ACTUAL_VAL=$(cat cpuset.cpus.effective) + [[ "$ACTUAL_VAL" != "$EXPECTED_VAL" ]] && { + echo "cpuset.cpus.effective: expect '$EXPECTED_VAL', found '$ACTUAL_VAL'" + echo "Test FAILED" + exit 1 + } +} + +# Adding current process to cgroup.procs as a test +test_add_proc() +{ + OUTSTR="$1" + ERRMSG=$((echo $$ > cgroup.procs) |& cat) + echo $ERRMSG | grep -q "$OUTSTR" + [[ $? -ne 0 ]] && { + echo "cgroup.procs: expect '$OUTSTR', got '$ERRMSG'" + echo "Test FAILED" + exit 1 + } + echo $$ > $CGROUP2/cgroup.procs # Move out the task +} + +# +# Testing the new "isolated" partition root type +# +test_isolated() +{ + echo 2-3 > cpuset.cpus + TYPE=$(cat cpuset.cpus.partition) + [[ $TYPE = member ]] || echo member > cpuset.cpus.partition + + console_msg "Change from member to root" + test_partition root + + console_msg "Change from root to isolated" + test_partition isolated + + console_msg "Change from isolated to member" + test_partition member + + console_msg "Change from member to isolated" + test_partition isolated + + console_msg "Change from isolated to root" + test_partition root + + console_msg "Change from root to member" + test_partition member + + # + # Testing partition root with no cpu + # + console_msg "Distribute all cpus to child partition" + echo +cpuset > cgroup.subtree_control + test_partition root + + mkdir A1 + cd A1 + echo 2-3 > cpuset.cpus + test_partition root + test_effective_cpus 2-3 + cd .. + test_effective_cpus "" + + console_msg "Moving task to partition test" + test_add_proc "No space left" + cd A1 + test_add_proc "" + cd .. + + console_msg "Shrink and expand child partition" + cd A1 + echo 2 > cpuset.cpus + cd .. + test_effective_cpus 3 + cd A1 + echo 2-3 > cpuset.cpus + cd .. + test_effective_cpus "" + + # Cleaning up + console_msg "Cleaning up" + echo $$ > $CGROUP2/cgroup.procs + [[ -d A1 ]] && rmdir A1 +} + +# +# Cpuset controller state transition test matrix. +# +# Cgroup test hierarchy +# +# test -- A1 -- A2 -- A3 +# \- B1 +# +# P<v> = set cpus.partition (0:member, 1:root, 2:isolated, -1:root invalid) +# C<l> = add cpu-list +# S<p> = use prefix in subtree_control +# T = put a task into cgroup +# O<c>-<v> = Write <v> to CPU online file of <c> +# +SETUP_A123_PARTITIONS="C1-3:P1:S+ C2-3:P1:S+ C3:P1" +TEST_MATRIX=( + # test old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate + # ---- ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ + " S+ C0-1 . . C2-3 S+ C4-5 . . 0 A2:0-1" + " S+ C0-1 . . C2-3 P1 . . . 0 " + " S+ C0-1 . . C2-3 P1:S+ C0-1:P1 . . 0 " + " S+ C0-1 . . C2-3 P1:S+ C1:P1 . . 0 " + " S+ C0-1:S+ . . C2-3 . . . P1 0 " + " S+ C0-1:P1 . . C2-3 S+ C1 . . 0 " + " S+ C0-1:P1 . . C2-3 S+ C1:P1 . . 0 " + " S+ C0-1:P1 . . C2-3 S+ C1:P1 . P1 0 " + " S+ C0-1:P1 . . C2-3 C4-5 . . . 0 A1:4-5" + " S+ C0-1:P1 . . C2-3 S+:C4-5 . . . 0 A1:4-5" + " S+ C0-1 . . C2-3:P1 . . . C2 0 " + " S+ C0-1 . . C2-3:P1 . . . C4-5 0 B1:4-5" + " S+ C0-3:P1:S+ C2-3:P1 . . . . . . 0 A1:0-1,A2:2-3" + " S+ C0-3:P1:S+ C2-3:P1 . . C1-3 . . . 0 A1:1,A2:2-3" + " S+ C2-3:P1:S+ C3:P1 . . C3 . . . 0 A1:,A2:3 A1:P1,A2:P1" + " S+ C2-3:P1:S+ C3:P1 . . C3 P0 . . 0 A1:3,A2:3 A1:P1,A2:P0" + " S+ C2-3:P1:S+ C2:P1 . . C2-4 . . . 0 A1:3-4,A2:2" + " S+ C2-3:P1:S+ C3:P1 . . C3 . .
C0-2 0 A1:,B1:0-2 A1:P1,A2:P1" + " S+ $SETUP_A123_PARTITIONS . C2-3 . . . 0 A1:,A2:2,A3:3 A1:P1,A2:P1,A3:P1" + + # CPU offlining cases: + " S+ C0-1 . . C2-3 S+ C4-5 . O2-0 0 A1:0-1,B1:3" + " S+ C0-3:P1:S+ C2-3:P1 . . O2-0 . . . 0 A1:0-1,A2:3" + " S+ C0-3:P1:S+ C2-3:P1 . . O2-0 O2-1 . . 0 A1:0-1,A2:2-3" + " S+ C0-3:P1:S+ C2-3:P1 . . O1-0 . . . 0 A1:0,A2:2-3" + " S+ C0-3:P1:S+ C2-3:P1 . . O1-0 O1-1 . . 0 A1:0-1,A2:2-3" + " S+ C2-3:P1:S+ C3:P1 . . O3-0 O3-1 . . 0 A1:2,A2:3 A1:P1,A2:P1" + " S+ C2-3:P1:S+ C3:P2 . . O3-0 O3-1 . . 0 A1:2,A2:3 A1:P1,A2:P2" + " S+ C2-3:P1:S+ C3:P1 . . O2-0 O2-1 . . 0 A1:2,A2:3 A1:P1,A2:P1" + " S+ C2-3:P1:S+ C3:P2 . . O2-0 O2-1 . . 0 A1:2,A2:3 A1:P1,A2:P2" + " S+ C2-3:P1:S+ C3:P1 . . O2-0 . . . 0 A1:,A2:3 A1:P1,A2:P1" + " S+ C2-3:P1:S+ C3:P1 . . O3-0 . . . 0 A1:2,A2: A1:P1,A2:P1" + " S+ C2-3:P1:S+ C3:P1 . . T:O2-0 . . . 0 A1:3,A2:3 A1:P1,A2:P-1" + " S+ C2-3:P1:S+ C3:P1 . . . T:O3-0 . . 0 A1:2,A2:2 A1:P1,A2:P-1" + " S+ $SETUP_A123_PARTITIONS . O1-0 . . . 0 A1:,A2:2,A3:3 A1:P1,A2:P1,A3:P1" + " S+ $SETUP_A123_PARTITIONS . O2-0 . . . 0 A1:1,A2:,A3:3 A1:P1,A2:P1,A3:P1" + " S+ $SETUP_A123_PARTITIONS . O3-0 . . . 0 A1:1,A2:2,A3: A1:P1,A2:P1,A3:P1" + " S+ $SETUP_A123_PARTITIONS . T:O1-0 . . . 0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P-1,A3:P-1" + " S+ $SETUP_A123_PARTITIONS . . T:O2-0 . . 0 A1:1,A2:3,A3:3 A1:P1,A2:P1,A3:P-1" + " S+ $SETUP_A123_PARTITIONS . . . T:O3-0 . 0 A1:1,A2:2,A3:2 A1:P1,A2:P1,A3:P-1" + " S+ $SETUP_A123_PARTITIONS . T:O1-0 O1-1 . . 0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1" + " S+ $SETUP_A123_PARTITIONS . . T:O2-0 O2-1 . 0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1" + " S+ $SETUP_A123_PARTITIONS . . . T:O3-0 O3-1 0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1" + " S+ $SETUP_A123_PARTITIONS . T:O1-0 O2-0 O1-1 . 0 A1:1,A2:,A3:3 A1:P1,A2:P1,A3:P1" + " S+ $SETUP_A123_PARTITIONS . T:O1-0 O2-0 O2-1 . 0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P-1,A3:P-1" + + # test old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate + # ---- ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ + # + # Incorrect change to cpuset.cpus invalidates partition root + # + # Adding CPUs to partition root that are not in parent's + # cpuset.cpus is allowed, but those extra CPUs are ignored. + " S+ C2-3:P1:S+ C3:P1 . . . C2-4 . . 0 A1:,A2:2-3 A1:P1,A2:P1" + + # Taking away all CPUs from parent or itself if there are tasks + # will make the partition invalid. + " S+ C2-3:P1:S+ C3:P1 . . T C2-3 . . 0 A1:2-3,A2:2-3 A1:P1,A2:P-1" + " S+ $SETUP_A123_PARTITIONS . T:C2-3 . . . 0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P-1,A3:P-1" + " S+ $SETUP_A123_PARTITIONS . T:C2-3:C1-3 . . . 0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1" + + # Changing a partition root to member makes child partitions invalid + " S+ C2-3:P1:S+ C3:P1 . . P0 . . . 0 A1:2-3,A2:3 A1:P0,A2:P-1" + " S+ $SETUP_A123_PARTITIONS . C2-3 P0 . . 0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P0,A3:P-1" + + # cpuset.cpus can contain cpus not in parent's cpuset.cpus as long + # as they overlap. + " S+ C2-3:P1:S+ . . . . C3-4:P1 . . 0 A1:2,A2:3 A1:P1,A2:P1" + + # Deletion of CPUs distributed to child cgroup is allowed. + " S+ C0-1:P1:S+ C1 . C2-3 C4-5 . . . 0 A1:4-5,A2:4-5" + + # To become a valid partition root, cpuset.cpus must overlap parent's + # cpuset.cpus. + " S+ C0-1:P1 . . C2-3 S+ C4-5:P1 . . 0 A1:0-1,A2:0-1 A1:P1,A2:P-1" + + # Enabling partition with child cpusets is allowed + " S+ C0-1:S+ C1 . C2-3 P1 . . . 0 A1:0-1,A2:1 A1:P1" + + # A partition root with non-partition root parent is invalid, but it + # can be made valid if its parent becomes a partition root too.
+ " S+ C0-1:S+ C1 . C2-3 . P2 . . 0 A1:0-1,A2:1 A1:P0,A2:P-2" + " S+ C0-1:S+ C1:P2 . C2-3 P1 . . . 0 A1:0,A2:1 A1:P1,A2:P2" + + # A non-exclusive cpuset.cpus change will invalidate partition and its siblings + " S+ C0-1:P1 . . C2-3 C0-2 . . . 0 A1:0-2,B1:2-3 A1:P-1,B1:P0" + " S+ C0-1:P1 . . P1:C2-3 C0-2 . . . 0 A1:0-2,B1:2-3 A1:P-1,B1:P-1" + " S+ C0-1 . . P1:C2-3 C0-2 . . . 0 A1:0-2,B1:2-3 A1:P0,B1:P-1" + + # test old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate + # ---- ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ + # Failure cases: + + # A task cannot be added to a partition with no cpu + " S+ C2-3:P1:S+ C3:P1 . . O2-0:T . . . 1 A1:,A2:3 A1:P1,A2:P1" +) + +# +# Write to the cpu online file +# $1 - <c>-<v> where <c> = cpu number, <v> value to be written +# +write_cpu_online() +{ + CPU=${1%-*} + VAL=${1#*-} + CPUFILE=//sys/devices/system/cpu/cpu${CPU}/online + if [[ $VAL -eq 0 ]] + then + OFFLINE_CPUS="$OFFLINE_CPUS $CPU" + else + [[ -n "$OFFLINE_CPUS" ]] && { + OFFLINE_CPUS=$(echo $CPU $CPU $OFFLINE_CPUS | fmt -1 |\ + sort | uniq -u) + } + fi + echo $VAL > $CPUFILE + pause 0.01 +} + +# +# Set controller state +# $1 - cgroup directory +# $2 - state +# $3 - showerr +# +# The presence of ":" in state means transition from one to the next. +# +set_ctrl_state() +{ + TMPMSG=/tmp/.msg_$$ + CGRP=$1 + STATE=$2 + SHOWERR=${3}${VERBOSE} + CTRL=${CTRL:=$CONTROLLER} + HASERR=0 + REDIRECT="2> $TMPMSG" + [[ -z "$STATE" || "$STATE" = '.' ]] && return 0 + + rm -f $TMPMSG + for CMD in $(echo $STATE | sed -e "s/:/ /g") + do + TFILE=$CGRP/cgroup.procs + SFILE=$CGRP/cgroup.subtree_control + PFILE=$CGRP/cpuset.cpus.partition + CFILE=$CGRP/cpuset.cpus + S=$(expr substr $CMD 1 1) + if [[ $S = S ]] + then + PREFIX=${CMD#?} + COMM="echo ${PREFIX}${CTRL} > $SFILE" + eval $COMM $REDIRECT + elif [[ $S = C ]] + then + CPUS=${CMD#?} + COMM="echo $CPUS > $CFILE" + eval $COMM $REDIRECT + elif [[ $S = P ]] + then + VAL=${CMD#?} + case $VAL in + 0) VAL=member + ;; + 1) VAL=root + ;; + 2) VAL=isolated + ;; + *) + echo "Invalid partition state - $VAL" + exit 1 + ;; + esac + COMM="echo $VAL > $PFILE" + eval $COMM $REDIRECT + elif [[ $S = O ]] + then + VAL=${CMD#?} + write_cpu_online $VAL + elif [[ $S = T ]] + then + COMM="echo 0 > $TFILE" + eval $COMM $REDIRECT + fi + RET=$? + [[ $RET -ne 0 ]] && { + [[ -n "$SHOWERR" ]] && { + echo "$COMM" + cat $TMPMSG + } + HASERR=1 + } + pause 0.01 + rm -f $TMPMSG + done + return $HASERR +} + +set_ctrl_state_noerr() +{ + CGRP=$1 + STATE=$2 + [[ -d $CGRP ]] || mkdir $CGRP + set_ctrl_state $CGRP $STATE 1 + [[ $? -ne 0 ]] && { + echo "ERROR: Failed to set $2 to cgroup $1!" + exit 1 + } +} + +online_cpus() +{ + [[ -n "OFFLINE_CPUS" ]] && { + for C in $OFFLINE_CPUS + do + write_cpu_online ${C}-1 + done + } +} + +# +# Return 1 if the list of effective cpus isn't the same as the initial list. +# +reset_cgroup_states() +{ + echo 0 > $CGROUP2/cgroup.procs + online_cpus + rmdir A1/A2/A3 A1/A2 A1 B1 > /dev/null 2>&1 + set_ctrl_state . 
S- + pause 0.01 +} + +dump_states() +{ + for DIR in A1 A1/A2 A1/A2/A3 B1 + do + ECPUS=$DIR/cpuset.cpus.effective + PRS=$DIR/cpuset.cpus.partition + [[ -e $ECPUS ]] && echo "$ECPUS: $(cat $ECPUS)" + [[ -e $PRS ]] && echo "$PRS: $(cat $PRS)" + done +} + +# +# Check effective cpus +# $1 - check string, format: <cgroup>:<cpu-list>[,<cgroup>:<cpu-list>]* +# +check_effective_cpus() +{ + CHK_STR=$1 + for CHK in $(echo $CHK_STR | sed -e "s/,/ /g") + do + set -- $(echo $CHK | sed -e "s/:/ /g") + CGRP=$1 + CPUS=$2 + [[ $CGRP = A2 ]] && CGRP=A1/A2 + [[ $CGRP = A3 ]] && CGRP=A1/A2/A3 + FILE=$CGRP/cpuset.cpus.effective + [[ -e $FILE ]] || return 1 + [[ $CPUS = $(cat $FILE) ]] || return 1 + done +} + +# +# Check cgroup states +# $1 - check string, format: <cgroup>:<state>[,<cgroup>:<state>]* +# +check_cgroup_states() +{ + CHK_STR=$1 + for CHK in $(echo $CHK_STR | sed -e "s/,/ /g") + do + set -- $(echo $CHK | sed -e "s/:/ /g") + CGRP=$1 + STATE=$2 + FILE= + EVAL=$(expr substr $STATE 2 2) + [[ $CGRP = A2 ]] && CGRP=A1/A2 + [[ $CGRP = A3 ]] && CGRP=A1/A2/A3 + + case $STATE in + P*) FILE=$CGRP/cpuset.cpus.partition + ;; + *) echo "Unknown state: $STATE!" + exit 1 + ;; + esac + VAL=$(cat $FILE) + + case "$VAL" in + member) VAL=0 + ;; + root) VAL=1 + ;; + isolated) + VAL=2 + ;; + "root invalid"*) + VAL=-1 + ;; + "isolated invalid"*) + VAL=-2 + ;; + esac + [[ $EVAL != $VAL ]] && return 1 + done + return 0 +} + +# +# Run cpuset state transition test +# $1 - test matrix name +# +# This test is somewhat fragile as delays (sleep x) are added in various +# places to make sure state changes are fully propagated before the next +# action. These delays may need to be adjusted if running in a slower machine. +# +run_state_test() +{ + TEST=$1 + CONTROLLER=cpuset + CPULIST=0-6 + I=0 + eval CNT="\${#$TEST[@]}" + + reset_cgroup_states + echo $CPULIST > cpuset.cpus + echo root > cpuset.cpus.partition + console_msg "Running state transition test ..." + + while [[ $I -lt $CNT ]] + do + echo "Running test $I ..." > /dev/console + eval set -- "\${$TEST[$I]}" + ROOT=$1 + OLD_A1=$2 + OLD_A2=$3 + OLD_A3=$4 + OLD_B1=$5 + NEW_A1=$6 + NEW_A2=$7 + NEW_A3=$8 + NEW_B1=$9 + RESULT=${10} + ECPUS=${11} + STATES=${12} + + set_ctrl_state_noerr . $ROOT + set_ctrl_state_noerr A1 $OLD_A1 + set_ctrl_state_noerr A1/A2 $OLD_A2 + set_ctrl_state_noerr A1/A2/A3 $OLD_A3 + set_ctrl_state_noerr B1 $OLD_B1 + RETVAL=0 + set_ctrl_state A1 $NEW_A1; ((RETVAL += $?)) + set_ctrl_state A1/A2 $NEW_A2; ((RETVAL += $?)) + set_ctrl_state A1/A2/A3 $NEW_A3; ((RETVAL += $?)) + set_ctrl_state B1 $NEW_B1; ((RETVAL += $?)) + + [[ $RETVAL -ne $RESULT ]] && { + echo "Test $TEST[$I] failed result check!" + eval echo \"\${$TEST[$I]}\" + dump_states + online_cpus + exit 1 + } + + [[ -n "$ECPUS" && "$ECPUS" != . ]] && { + check_effective_cpus $ECPUS + [[ $? -ne 0 ]] && { + echo "Test $TEST[$I] failed effective CPU check!" + eval echo \"\${$TEST[$I]}\" + echo + dump_states + online_cpus + exit 1 + } + } + + [[ -n "$STATES" ]] && { + check_cgroup_states $STATES + [[ $? -ne 0 ]] && { + echo "FAILED: Test $TEST[$I] failed states check!" + eval echo \"\${$TEST[$I]}\" + echo + dump_states + online_cpus + exit 1 + } + } + + reset_cgroup_states + # + # Check to see if effective cpu list changes + # + pause 0.05 + NEWLIST=$(cat cpuset.cpus.effective) + [[ $NEWLIST != $CPULIST ]] && { + echo "Effective cpus changed to $NEWLIST after test $I!" + exit 1 + } + [[ -n "$VERBOSE" ]] && echo "Test $I done." + ((I++)) + done + echo "All $I tests of $TEST PASSED." 
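+ # + # Worked example of the row encoding: the matrix entry " S+ C0-3:P1:S+ C2-3:P1 . . O2-0 . . . 0 A1:0-1,A2:3" means: enable cpuset in the test root (S+); give A1 cpus 0-3, make it a partition root and enable cpuset in its subtree_control (C0-3:P1:S+); give A2 cpus 2-3 and make it a partition root (C2-3:P1); then offline cpu 2 (O2-0). No write is expected to fail (0) and the effective cpus should end up as 0-1 in A1 and 3 in A2. Roughly the equivalent raw commands, run from the test root: + # echo +cpuset > cgroup.subtree_control + # echo 0-3 > A1/cpuset.cpus; echo root > A1/cpuset.cpus.partition + # echo +cpuset > A1/cgroup.subtree_control + # echo 2-3 > A1/A2/cpuset.cpus; echo root > A1/A2/cpuset.cpus.partition + # echo 0 > /sys/devices/system/cpu/cpu2/online + #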
+ + echo member > cpuset.cpus.partition +} + +# +# Wait for an inotify event on the given file and read it +# $1: cgroup file to wait for +# $2: file to store the read result +# +wait_inotify() +{ + CGROUP_FILE=$1 + OUTPUT_FILE=$2 + + $WAIT_INOTIFY $CGROUP_FILE + cat $CGROUP_FILE > $OUTPUT_FILE +} + +# +# Test if inotify events are properly generated when going into and out of +# invalid partition state. +# +test_inotify() +{ + ERR=0 + PRS=/tmp/.prs_$$ + [[ -f $WAIT_INOTIFY ]] || { + echo "wait_inotify not found, inotify test SKIPPED." + return + } + + pause 0.01 + echo 1 > cpuset.cpus + echo 0 > cgroup.procs + echo root > cpuset.cpus.partition + pause 0.01 + rm -f $PRS + wait_inotify $PWD/cpuset.cpus.partition $PRS & + pause 0.01 + set_ctrl_state . "O1-0" + pause 0.01 + check_cgroup_states ".:P-1" + if [[ $? -ne 0 ]] + then + echo "FAILED: Inotify test - partition not invalid" + ERR=1 + elif [[ ! -f $PRS ]] + then + echo "FAILED: Inotify test - event not generated" + ERR=1 + kill %1 + elif [[ $(cat $PRS) != "root invalid"* ]] + then + echo "FAILED: Inotify test - incorrect state" + cat $PRS + ERR=1 + fi + online_cpus + echo member > cpuset.cpus.partition + echo 0 > ../cgroup.procs + if [[ $ERR -ne 0 ]] + then + exit 1 + else + echo "Inotify test PASSED" + fi +} + +run_state_test TEST_MATRIX +test_isolated +test_inotify +echo "All tests PASSED." +cd .. +rmdir test diff --git a/tools/testing/selftests/cgroup/wait_inotify.c b/tools/testing/selftests/cgroup/wait_inotify.c new file mode 100644 index 000000000000..e11b431e1b62 --- /dev/null +++ b/tools/testing/selftests/cgroup/wait_inotify.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Wait for an inotify event on the given cgroup file. + */ +#include <linux/limits.h> +#include <sys/inotify.h> +#include <sys/mman.h> +#include <sys/ptrace.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <errno.h> +#include <fcntl.h> +#include <poll.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +static const char usage[] = "Usage: %s [-v] <cgroup_file>\n"; +static char *file; +static int verbose; + +static inline void fail_message(char *msg) +{ + fprintf(stderr, msg, file); + exit(1); +} + +int main(int argc, char *argv[]) +{ + char *cmd = argv[0]; + int c, fd; + struct pollfd fds = { .events = POLLIN, }; + + while ((c = getopt(argc, argv, "v")) != -1) { + switch (c) { + case 'v': + verbose++; + break; + } + argv++, argc--; + } + + if (argc != 2) { + fprintf(stderr, usage, cmd); + return -1; + } + file = argv[1]; + fd = open(file, O_RDONLY); + if (fd < 0) + fail_message("Cgroup file %s not found!\n"); + close(fd); + + fd = inotify_init(); + if (fd < 0) + fail_message("inotify_init() fails on %s!\n"); + if (inotify_add_watch(fd, file, IN_MODIFY) < 0) + fail_message("inotify_add_watch() fails on %s!\n"); + fds.fd = fd; + + /* + * poll waiting loop + */ + for (;;) { + int ret = poll(&fds, 1, 10000); + + if (ret < 0) { + if (errno == EINTR) + continue; + perror("poll"); + exit(1); + } + if ((ret > 0) && (fds.revents & POLLIN)) + break; + } + if (verbose) { + struct inotify_event events[10]; + long len; + + usleep(1000); + len = read(fd, events, sizeof(events)); + printf("Number of events read = %ld\n", + len/sizeof(struct inotify_event)); + } + close(fd); + return 0; +} diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 0470c5f3e690..a1fa2eff8192 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -6,6
+6,7 @@ TEST_GEN_FILES += huge_count_read_write TEST_FILES = _chk_dependency.sh _debugfs_common.sh TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh +TEST_PROGS += debugfs_duplicate_context_creation.sh TEST_PROGS += sysfs.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh b/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh new file mode 100644 index 000000000000..4a76e37ef16b --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source _debugfs_common.sh + +# Test duplicated context creation +# ================================ + +if ! echo foo > "$DBGFS/mk_contexts" +then + echo "context creation failed" + exit 1 +fi + +if echo foo > "$DBGFS/mk_contexts" +then + echo "duplicate context creation success" + exit 1 +fi + +if ! echo foo > "$DBGFS/rm_contexts" +then + echo "context deletion failed" + exit 1 +fi + +exit 0 diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc new file mode 100644 index 000000000000..fc1daac7f066 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc @@ -0,0 +1,27 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Event probe event parser error log check +# requires: dynamic_events events/syscalls/sys_enter_openat "<attached-group>.<attached-event> [<args>]":README error_log + +check_error() { # command-with-error-pos-by-^ + ftrace_errlog_check 'event_probe' "$1" 'dynamic_events' +} + +check_error 'e ^a.' # NO_EVENT_INFO +check_error 'e ^.b' # NO_EVENT_INFO +check_error 'e ^a.b' # BAD_ATTACH_EVENT +check_error 'e syscalls/sys_enter_openat ^foo' # BAD_ATTACH_ARG +check_error 'e:^/bar syscalls/sys_enter_openat' # NO_GROUP_NAME +check_error 'e:^12345678901234567890123456789012345678901234567890123456789012345/bar syscalls/sys_enter_openat' # GROUP_TOO_LONG + +check_error 'e:^foo.1/bar syscalls/sys_enter_openat' # BAD_GROUP_NAME +check_error 'e:^ syscalls/sys_enter_openat' # NO_EVENT_NAME +check_error 'e:foo/^12345678901234567890123456789012345678901234567890123456789012345 syscalls/sys_enter_openat' # EVENT_TOO_LONG +check_error 'e:foo/^bar.1 syscalls/sys_enter_openat' # BAD_EVENT_NAME + +check_error 'e:foo/bar syscalls/sys_enter_openat arg=^dfd' # BAD_FETCH_ARG +check_error 'e:foo/bar syscalls/sys_enter_openat ^arg=$foo' # BAD_ATTACH_ARG + +check_error 'e:foo/bar syscalls/sys_enter_openat if ^' # NO_EP_FILTER + +exit 0 diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc index 3145b0f1835c..8d26d5505808 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc @@ -85,7 +85,7 @@ run_enable_disable() { echo $check_disable > $EVENT_ENABLE done sleep $SLEEP_TIME - echo " make sure it's still works" + echo " make sure it still works" test_event_enabled $check_enable_star reset_ftrace_filter diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c index f8c43ce8fe66..c6b8f32990c8 100644 --- 
a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c @@ -184,7 +184,7 @@ int main(int argc, char *argv[]) /* * If res is non-zero, we either requeued the waiter or hit an * error, break out and handle it. If it is zero, then the - * signal may have hit before the the waiter was blocked on f1. + * signal may have hit before the waiter was blocked on f1. * Try again. */ if (res > 0) { diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index d625a3f83780..2f0d705db9db 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only +/aarch64/aarch32_id_regs /aarch64/arch_timer /aarch64/debug-exceptions /aarch64/get-reg-list @@ -28,6 +29,7 @@ /x86_64/max_vcpuid_cap_test /x86_64/mmio_warning_test /x86_64/monitor_mwait_test +/x86_64/nested_exceptions_test /x86_64/nx_huge_pages_test /x86_64/platform_info_test /x86_64/pmu_event_filter_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 6448cb9f710f..0172eb6cb6ee 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -91,6 +91,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/kvm_clock_test TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test TEST_GEN_PROGS_x86_64 += x86_64/monitor_mwait_test +TEST_GEN_PROGS_x86_64 += x86_64/nested_exceptions_test TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id @@ -146,6 +147,7 @@ TEST_GEN_PROGS_x86_64 += system_counter_offset_test # Compiled outputs used by test targets TEST_GEN_PROGS_EXTENDED_x86_64 += x86_64/nx_huge_pages_test +TEST_GEN_PROGS_aarch64 += aarch64/aarch32_id_regs TEST_GEN_PROGS_aarch64 += aarch64/arch_timer TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list diff --git a/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c new file mode 100644 index 000000000000..6f9c1f19c7f6 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * aarch32_id_regs - Test for ID register behavior on AArch64-only systems + * + * Copyright (c) 2022 Google LLC. + * + * Test that KVM handles the AArch64 views of the AArch32 ID registers as RAZ + * and WI from userspace. 
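+ * + * The checks also cover the two unallocated encodings in the AArch32 ID + * register block, sys_reg(3, 0, 0, 3, 3) and sys_reg(3, 0, 0, 3, 7), which + * are likewise expected to read as zero.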
+ */ + +#include <stdint.h> + +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + +#define BAD_ID_REG_VAL 0x1badc0deul + +#define GUEST_ASSERT_REG_RAZ(reg) GUEST_ASSERT_EQ(read_sysreg_s(reg), 0) + +static void guest_main(void) +{ + GUEST_ASSERT_REG_RAZ(SYS_ID_PFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_PFR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_DFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_AFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR2_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR3_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR2_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR3_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR4_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR5_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR4_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR6_EL1); + GUEST_ASSERT_REG_RAZ(SYS_MVFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_MVFR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_MVFR2_EL1); + GUEST_ASSERT_REG_RAZ(sys_reg(3, 0, 0, 3, 3)); + GUEST_ASSERT_REG_RAZ(SYS_ID_PFR2_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_DFR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR5_EL1); + GUEST_ASSERT_REG_RAZ(sys_reg(3, 0, 0, 3, 7)); + + GUEST_DONE(); +} + +static void test_guest_raz(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + } +} + +static uint64_t raz_wi_reg_ids[] = { + KVM_ARM64_SYS_REG(SYS_ID_PFR0_EL1), + KVM_ARM64_SYS_REG(SYS_ID_PFR1_EL1), + KVM_ARM64_SYS_REG(SYS_ID_DFR0_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR0_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR1_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR2_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR3_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR0_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR1_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR2_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR3_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR4_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR5_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR4_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR6_EL1), + KVM_ARM64_SYS_REG(SYS_MVFR0_EL1), + KVM_ARM64_SYS_REG(SYS_MVFR1_EL1), + KVM_ARM64_SYS_REG(SYS_MVFR2_EL1), + KVM_ARM64_SYS_REG(SYS_ID_PFR2_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR5_EL1), +}; + +static void test_user_raz_wi(struct kvm_vcpu *vcpu) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(raz_wi_reg_ids); i++) { + uint64_t reg_id = raz_wi_reg_ids[i]; + uint64_t val; + + vcpu_get_reg(vcpu, reg_id, &val); + ASSERT_EQ(val, 0); + + /* + * Expect the ioctl to succeed with no effect on the register + * value. 
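+ * This is the "WI" (write-ignored) half of the contract: KVM_SET_ONE_REG + * accepts the bogus value, and the read-back below confirms it was + * discarded.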
+ */ + vcpu_set_reg(vcpu, reg_id, BAD_ID_REG_VAL); + + vcpu_get_reg(vcpu, reg_id, &val); + ASSERT_EQ(val, 0); + } +} + +static uint64_t raz_invariant_reg_ids[] = { + KVM_ARM64_SYS_REG(SYS_ID_AFR0_EL1), + KVM_ARM64_SYS_REG(sys_reg(3, 0, 0, 3, 3)), + KVM_ARM64_SYS_REG(SYS_ID_DFR1_EL1), + KVM_ARM64_SYS_REG(sys_reg(3, 0, 0, 3, 7)), +}; + +static void test_user_raz_invariant(struct kvm_vcpu *vcpu) +{ + int i, r; + + for (i = 0; i < ARRAY_SIZE(raz_invariant_reg_ids); i++) { + uint64_t reg_id = raz_invariant_reg_ids[i]; + uint64_t val; + + vcpu_get_reg(vcpu, reg_id, &val); + ASSERT_EQ(val, 0); + + r = __vcpu_set_reg(vcpu, reg_id, BAD_ID_REG_VAL); + TEST_ASSERT(r < 0 && errno == EINVAL, + "unexpected KVM_SET_ONE_REG error: r=%d, errno=%d", r, errno); + + vcpu_get_reg(vcpu, reg_id, &val); + ASSERT_EQ(val, 0); + } +} + + + +static bool vcpu_aarch64_only(struct kvm_vcpu *vcpu) +{ + uint64_t val, el0; + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), &val); + + el0 = (val & ARM64_FEATURE_MASK(ID_AA64PFR0_EL0)) >> ID_AA64PFR0_EL0_SHIFT; + return el0 == ID_AA64PFR0_ELx_64BIT_ONLY; +} + +int main(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + + TEST_REQUIRE(vcpu_aarch64_only(vcpu)); + + ucall_init(vm, NULL); + + test_user_raz_wi(vcpu); + test_user_raz_invariant(vcpu); + test_guest_raz(vcpu); + + ucall_uninit(vm); + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index 2ee35cf9801e..947bd201435c 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -22,6 +22,7 @@ #define SPSR_SS (1 << 21) extern unsigned char sw_bp, sw_bp2, hw_bp, hw_bp2, bp_svc, bp_brk, hw_wp, ss_start; +extern unsigned char iter_ss_begin, iter_ss_end; static volatile uint64_t sw_bp_addr, hw_bp_addr; static volatile uint64_t wp_addr, wp_data_addr; static volatile uint64_t svc_addr; @@ -238,6 +239,46 @@ static void guest_svc_handler(struct ex_regs *regs) { svc_addr = regs->pc; } +enum single_step_op { + SINGLE_STEP_ENABLE = 0, + SINGLE_STEP_DISABLE = 1, +}; + +static void guest_code_ss(int test_cnt) +{ + uint64_t i; + uint64_t bvr, wvr, w_bvr, w_wvr; + + for (i = 0; i < test_cnt; i++) { + /* Bits [1:0] of dbg{b,w}vr are RES0 */ + w_bvr = i << 2; + w_wvr = i << 2; + + /* Enable Single Step execution */ + GUEST_SYNC(SINGLE_STEP_ENABLE); + + /* + * The userspace will verify that the pc is as expected during + * single step execution between iter_ss_begin and iter_ss_end.
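+ * The two labels are plain asm labels defined below; the host side sees + * them through the "extern unsigned char iter_ss_begin, iter_ss_end" + * declarations at the top of this file.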
+ */ + asm volatile("iter_ss_begin:nop\n"); + + write_sysreg(w_bvr, dbgbvr0_el1); + write_sysreg(w_wvr, dbgwvr0_el1); + bvr = read_sysreg(dbgbvr0_el1); + wvr = read_sysreg(dbgwvr0_el1); + + asm volatile("iter_ss_end:\n"); + + /* Disable Single Step execution */ + GUEST_SYNC(SINGLE_STEP_DISABLE); + + GUEST_ASSERT(bvr == w_bvr); + GUEST_ASSERT(wvr == w_wvr); + } + GUEST_DONE(); +} + static int debug_version(struct kvm_vcpu *vcpu) { uint64_t id_aa64dfr0; @@ -246,7 +287,7 @@ static int debug_version(struct kvm_vcpu *vcpu) return id_aa64dfr0 & 0xf; } -int main(int argc, char *argv[]) +static void test_guest_debug_exceptions(void) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; @@ -259,9 +300,6 @@ int main(int argc, char *argv[]) vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vcpu); - __TEST_REQUIRE(debug_version(vcpu) >= 6, - "Armv8 debug architecture not supported."); - vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, ESR_EC_BRK_INS, guest_sw_bp_handler); vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, @@ -294,5 +332,108 @@ done: kvm_vm_free(vm); +} + +void test_single_step_from_userspace(int test_cnt) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + struct kvm_run *run; + uint64_t pc, cmd; + uint64_t test_pc = 0; + bool ss_enable = false; + struct kvm_guest_debug debug = {}; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code_ss); + ucall_init(vm, NULL); + run = vcpu->run; + vcpu_args_set(vcpu, 1, test_cnt); + + while (1) { + vcpu_run(vcpu); + if (run->exit_reason != KVM_EXIT_DEBUG) { + cmd = get_ucall(vcpu, &uc); + if (cmd == UCALL_ABORT) { + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + } else if (cmd == UCALL_DONE) { + break; + } + + TEST_ASSERT(cmd == UCALL_SYNC, + "Unexpected ucall cmd 0x%lx", cmd); + + if (uc.args[1] == SINGLE_STEP_ENABLE) { + debug.control = KVM_GUESTDBG_ENABLE | + KVM_GUESTDBG_SINGLESTEP; + ss_enable = true; + } else { + debug.control = SINGLE_STEP_DISABLE; + ss_enable = false; + } + + vcpu_guest_debug_set(vcpu, &debug); + continue; + } + + TEST_ASSERT(ss_enable, "Unexpected KVM_EXIT_DEBUG"); + + /* Check if the current pc is expected. */ + vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc), &pc); + TEST_ASSERT(!test_pc || pc == test_pc, + "Unexpected pc 0x%lx (expected 0x%lx)", + pc, test_pc); + + /* + * If the current pc is between iter_ss_begin and + * iter_ss_end, the pc for the next KVM_EXIT_DEBUG should + * be the current pc + 4.
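+ * (A64 instructions are a fixed 4 bytes, so each single step through + * that window should advance the pc by exactly 4.)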
+ */ + if ((pc >= (uint64_t)&iter_ss_begin) && + (pc < (uint64_t)&iter_ss_end)) + test_pc = pc + 4; + else + test_pc = 0; + } + + kvm_vm_free(vm); +} + +static void help(char *name) +{ + puts(""); + printf("Usage: %s [-h] [-i iterations of the single step test]\n", name); + puts(""); + exit(0); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int opt; + int ss_iteration = 10000; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + __TEST_REQUIRE(debug_version(vcpu) >= 6, + "Armv8 debug architecture not supported."); + kvm_vm_free(vm); + + while ((opt = getopt(argc, argv, "i:")) != -1) { + switch (opt) { + case 'i': + ss_iteration = atoi(optarg); + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + test_guest_debug_exceptions(); + test_single_step_from_userspace(ss_iteration); + return 0; } diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c index f7621f6e938e..e0b9e81a3e09 100644 --- a/tools/testing/selftests/kvm/aarch64/psci_test.c +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -1,12 +1,14 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * psci_cpu_on_test - Test that the observable state of a vCPU targeted by the - * CPU_ON PSCI call matches what the caller requested. + * psci_test - Tests relating to KVM's PSCI implementation. * * Copyright (c) 2021 Google LLC. * - * This is a regression test for a race between KVM servicing the PSCI call and - * userspace reading the vCPUs registers. + * This test includes: + * - A regression test for a race between KVM servicing the PSCI CPU_ON call + * and userspace reading the targeted vCPU's registers. + * - A test for KVM's handling of PSCI SYSTEM_SUSPEND and the associated + * KVM_SYSTEM_EVENT_SUSPEND UAPI. 
*/ #define _GNU_SOURCE diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 9c883c94d478..b5234d6efbe1 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -17,6 +17,7 @@ #include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/atomic.h> +#include <asm/barrier.h> #include "kvm_util.h" #include "test_util.h" @@ -264,7 +265,8 @@ static void default_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) static bool dirty_ring_supported(void) { - return kvm_has_cap(KVM_CAP_DIRTY_LOG_RING); + return (kvm_has_cap(KVM_CAP_DIRTY_LOG_RING) || + kvm_has_cap(KVM_CAP_DIRTY_LOG_RING_ACQ_REL)); } static void dirty_ring_create_vm_done(struct kvm_vm *vm) @@ -279,12 +281,12 @@ static void dirty_ring_create_vm_done(struct kvm_vm *vm) static inline bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn) { - return gfn->flags == KVM_DIRTY_GFN_F_DIRTY; + return smp_load_acquire(&gfn->flags) == KVM_DIRTY_GFN_F_DIRTY; } static inline void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn) { - gfn->flags = KVM_DIRTY_GFN_F_RESET; + smp_store_release(&gfn->flags, KVM_DIRTY_GFN_F_RESET); } static uint32_t dirty_ring_collect_one(struct kvm_dirty_gfn *dirty_gfns, diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 24fde97f6121..e42a09cd24a0 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -175,6 +175,10 @@ extern const struct vm_guest_mode_params vm_guest_mode_params[]; int open_path_or_exit(const char *path, int flags); int open_kvm_dev_path_or_exit(void); + +bool get_kvm_intel_param_bool(const char *param); +bool get_kvm_amd_param_bool(const char *param); + unsigned int kvm_check_cap(long cap); static inline bool kvm_has_cap(long cap) diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 5c5a88180b6c..befc754ce9b3 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -63,8 +63,10 @@ void test_assert(bool exp, const char *exp_str, #a, #b, #a, (unsigned long) __a, #b, (unsigned long) __b); \ } while (0) -#define TEST_FAIL(fmt, ...) \ - TEST_ASSERT(false, fmt, ##__VA_ARGS__) +#define TEST_FAIL(fmt, ...) 
do { \ + TEST_ASSERT(false, fmt, ##__VA_ARGS__); \ + __builtin_unreachable(); \ +} while (0) size_t parse_size(const char *size); diff --git a/tools/testing/selftests/kvm/include/x86_64/evmcs.h b/tools/testing/selftests/kvm/include/x86_64/evmcs.h index 3c9260f8e116..58db74f68af2 100644 --- a/tools/testing/selftests/kvm/include/x86_64/evmcs.h +++ b/tools/testing/selftests/kvm/include/x86_64/evmcs.h @@ -203,14 +203,25 @@ struct hv_enlightened_vmcs { u32 reserved:30; } hv_enlightenments_control; u32 hv_vp_id; - + u32 padding32_2; u64 hv_vm_id; u64 partition_assist_page; u64 padding64_4[4]; u64 guest_bndcfgs; - u64 padding64_5[7]; + u64 guest_ia32_perf_global_ctrl; + u64 guest_ia32_s_cet; + u64 guest_ssp; + u64 guest_ia32_int_ssp_table_addr; + u64 guest_ia32_lbr_ctl; + u64 padding64_5[2]; u64 xss_exit_bitmap; - u64 padding64_6[7]; + u64 encls_exiting_bitmap; + u64 host_ia32_perf_global_ctrl; + u64 tsc_multiplier; + u64 host_ia32_s_cet; + u64 host_ssp; + u64 host_ia32_int_ssp_table_addr; + u64 padding64_6; }; #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0 @@ -656,6 +667,18 @@ static inline int evmcs_vmread(uint64_t encoding, uint64_t *value) case VIRTUAL_PROCESSOR_ID: *value = current_evmcs->virtual_processor_id; break; + case HOST_IA32_PERF_GLOBAL_CTRL: + *value = current_evmcs->host_ia32_perf_global_ctrl; + break; + case GUEST_IA32_PERF_GLOBAL_CTRL: + *value = current_evmcs->guest_ia32_perf_global_ctrl; + break; + case ENCLS_EXITING_BITMAP: + *value = current_evmcs->encls_exiting_bitmap; + break; + case TSC_MULTIPLIER: + *value = current_evmcs->tsc_multiplier; + break; default: return 1; } @@ -1169,6 +1192,22 @@ static inline int evmcs_vmwrite(uint64_t encoding, uint64_t value) current_evmcs->virtual_processor_id = value; current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT; break; + case HOST_IA32_PERF_GLOBAL_CTRL: + current_evmcs->host_ia32_perf_global_ctrl = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case GUEST_IA32_PERF_GLOBAL_CTRL: + current_evmcs->guest_ia32_perf_global_ctrl = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case ENCLS_EXITING_BITMAP: + current_evmcs->encls_exiting_bitmap = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; + break; + case TSC_MULTIPLIER: + current_evmcs->tsc_multiplier = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; + break; default: return 1; } diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 0cbc71b7af50..e8ca0d8a6a7e 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -825,6 +825,8 @@ static inline uint8_t wrmsr_safe(uint32_t msr, uint64_t val) return kvm_asm_safe("wrmsr", "a"(val & -1u), "d"(val >> 32), "c"(msr)); } +bool kvm_is_tdp_enabled(void); + uint64_t vm_get_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, uint64_t vaddr); void vm_set_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, @@ -855,6 +857,8 @@ enum pg_level { #define PG_SIZE_1G PG_LEVEL_SIZE(PG_LEVEL_1G) void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level); +void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + uint64_t nr_bytes, int level); /* * Basic CPU control in CR0 diff --git a/tools/testing/selftests/kvm/include/x86_64/svm_util.h 
b/tools/testing/selftests/kvm/include/x86_64/svm_util.h index a339b537a575..7aee6244ab6a 100644 --- a/tools/testing/selftests/kvm/include/x86_64/svm_util.h +++ b/tools/testing/selftests/kvm/include/x86_64/svm_util.h @@ -9,15 +9,12 @@ #ifndef SELFTEST_KVM_SVM_UTILS_H #define SELFTEST_KVM_SVM_UTILS_H +#include <asm/svm.h> + #include <stdint.h> #include "svm.h" #include "processor.h" -#define SVM_EXIT_EXCP_BASE 0x040 -#define SVM_EXIT_HLT 0x078 -#define SVM_EXIT_MSR 0x07c -#define SVM_EXIT_VMMCALL 0x081 - struct svm_test_data { /* VMCB */ struct vmcb *vmcb; /* gva */ diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index 790c6d1ecb34..71b290b6469d 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -8,6 +8,8 @@ #ifndef SELFTEST_KVM_VMX_H #define SELFTEST_KVM_VMX_H +#include <asm/vmx.h> + #include <stdint.h> #include "processor.h" #include "apic.h" @@ -100,55 +102,6 @@ #define VMX_EPT_VPID_CAP_AD_BITS 0x00200000 #define EXIT_REASON_FAILED_VMENTRY 0x80000000 -#define EXIT_REASON_EXCEPTION_NMI 0 -#define EXIT_REASON_EXTERNAL_INTERRUPT 1 -#define EXIT_REASON_TRIPLE_FAULT 2 -#define EXIT_REASON_INTERRUPT_WINDOW 7 -#define EXIT_REASON_NMI_WINDOW 8 -#define EXIT_REASON_TASK_SWITCH 9 -#define EXIT_REASON_CPUID 10 -#define EXIT_REASON_HLT 12 -#define EXIT_REASON_INVD 13 -#define EXIT_REASON_INVLPG 14 -#define EXIT_REASON_RDPMC 15 -#define EXIT_REASON_RDTSC 16 -#define EXIT_REASON_VMCALL 18 -#define EXIT_REASON_VMCLEAR 19 -#define EXIT_REASON_VMLAUNCH 20 -#define EXIT_REASON_VMPTRLD 21 -#define EXIT_REASON_VMPTRST 22 -#define EXIT_REASON_VMREAD 23 -#define EXIT_REASON_VMRESUME 24 -#define EXIT_REASON_VMWRITE 25 -#define EXIT_REASON_VMOFF 26 -#define EXIT_REASON_VMON 27 -#define EXIT_REASON_CR_ACCESS 28 -#define EXIT_REASON_DR_ACCESS 29 -#define EXIT_REASON_IO_INSTRUCTION 30 -#define EXIT_REASON_MSR_READ 31 -#define EXIT_REASON_MSR_WRITE 32 -#define EXIT_REASON_INVALID_STATE 33 -#define EXIT_REASON_MWAIT_INSTRUCTION 36 -#define EXIT_REASON_MONITOR_INSTRUCTION 39 -#define EXIT_REASON_PAUSE_INSTRUCTION 40 -#define EXIT_REASON_MCE_DURING_VMENTRY 41 -#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 -#define EXIT_REASON_APIC_ACCESS 44 -#define EXIT_REASON_EOI_INDUCED 45 -#define EXIT_REASON_EPT_VIOLATION 48 -#define EXIT_REASON_EPT_MISCONFIG 49 -#define EXIT_REASON_INVEPT 50 -#define EXIT_REASON_RDTSCP 51 -#define EXIT_REASON_PREEMPTION_TIMER 52 -#define EXIT_REASON_INVVPID 53 -#define EXIT_REASON_WBINVD 54 -#define EXIT_REASON_XSETBV 55 -#define EXIT_REASON_APIC_WRITE 56 -#define EXIT_REASON_INVPCID 58 -#define EXIT_REASON_PML_FULL 62 -#define EXIT_REASON_XSAVES 63 -#define EXIT_REASON_XRSTORS 64 -#define LAST_EXIT_REASON 64 enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, @@ -208,6 +161,8 @@ enum vmcs_field { VMWRITE_BITMAP_HIGH = 0x00002029, XSS_EXIT_BITMAP = 0x0000202C, XSS_EXIT_BITMAP_HIGH = 0x0000202D, + ENCLS_EXITING_BITMAP = 0x0000202E, + ENCLS_EXITING_BITMAP_HIGH = 0x0000202F, TSC_MULTIPLIER = 0x00002032, TSC_MULTIPLIER_HIGH = 0x00002033, GUEST_PHYSICAL_ADDRESS = 0x00002400, diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 9889fe0d8919..f1cb1627161f 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -50,6 +50,45 @@ int open_kvm_dev_path_or_exit(void) return _open_kvm_dev_path_or_exit(O_RDONLY); } +static bool get_module_param_bool(const char *module_name, 
const char *param) +{ + const int path_size = 128; + char path[path_size]; + char value; + ssize_t r; + int fd; + + r = snprintf(path, path_size, "/sys/module/%s/parameters/%s", + module_name, param); + TEST_ASSERT(r < path_size, + "Failed to construct sysfs path in %d bytes.", path_size); + + fd = open_path_or_exit(path, O_RDONLY); + + r = read(fd, &value, 1); + TEST_ASSERT(r == 1, "read(%s) failed", path); + + r = close(fd); + TEST_ASSERT(!r, "close(%s) failed", path); + + if (value == 'Y') + return true; + else if (value == 'N') + return false; + + TEST_FAIL("Unrecognized value '%c' for boolean module param", value); +} + +bool get_kvm_intel_param_bool(const char *param) +{ + return get_module_param_bool("kvm_intel", param); +} + +bool get_kvm_amd_param_bool(const char *param) +{ + return get_module_param_bool("kvm_amd", param); +} + /* * Capability * @@ -82,7 +121,10 @@ unsigned int kvm_check_cap(long cap) void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size) { - vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING, ring_size); + if (vm_check_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL)) + vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL, ring_size); + else + vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING, ring_size); vm->dirty_ring_size = ring_size; } diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 2e6e61bbe81b..39c4409ef56a 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -111,6 +111,14 @@ static void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent) } } +bool kvm_is_tdp_enabled(void) +{ + if (is_intel_cpu()) + return get_kvm_intel_param_bool("ept"); + else + return get_kvm_amd_param_bool("npt"); +} + void virt_arch_pgd_alloc(struct kvm_vm *vm) { TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " @@ -214,6 +222,25 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) __virt_pg_map(vm, vaddr, paddr, PG_LEVEL_4K); } +void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + uint64_t nr_bytes, int level) +{ + uint64_t pg_size = PG_LEVEL_SIZE(level); + uint64_t nr_pages = nr_bytes / pg_size; + int i; + + TEST_ASSERT(nr_bytes % pg_size == 0, + "Region size not aligned: nr_bytes: 0x%lx, page size: 0x%lx", + nr_bytes, pg_size); + + for (i = 0; i < nr_pages; i++) { + __virt_pg_map(vm, vaddr, paddr, level); + + vaddr += pg_size; + paddr += pg_size; + } +} + static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, uint64_t vaddr) @@ -1294,20 +1321,9 @@ done: /* Returns true if kvm_intel was loaded with unrestricted_guest=1. */ bool vm_is_unrestricted_guest(struct kvm_vm *vm) { - char val = 'N'; - size_t count; - FILE *f; - /* Ensure that a KVM vendor-specific module is loaded. 
*/ if (vm == NULL) close(open_kvm_dev_path_or_exit()); - f = fopen("/sys/module/kvm_intel/parameters/unrestricted_guest", "r"); - if (f) { - count = fread(&val, sizeof(char), 1, f); - TEST_ASSERT(count == 1, "Unable to read from param file."); - fclose(f); - } - - return val == 'Y'; + return get_kvm_intel_param_bool("unrestricted_guest"); } diff --git a/tools/testing/selftests/kvm/lib/x86_64/svm.c b/tools/testing/selftests/kvm/lib/x86_64/svm.c index 6d445886e16c..5495a92dfd5a 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/svm.c +++ b/tools/testing/selftests/kvm/lib/x86_64/svm.c @@ -60,18 +60,6 @@ static void vmcb_set_seg(struct vmcb_seg *seg, u16 selector, seg->base = base; } -/* - * Avoid using memset to clear the vmcb, since libc may not be - * available in L1 (and, even if it is, features that libc memset may - * want to use, like AVX, may not be enabled). - */ -static void clear_vmcb(struct vmcb *vmcb) -{ - int n = sizeof(*vmcb) / sizeof(u32); - - asm volatile ("rep stosl" : "+c"(n), "+D"(vmcb) : "a"(0) : "memory"); -} - void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp) { struct vmcb *vmcb = svm->vmcb; @@ -88,7 +76,7 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r wrmsr(MSR_EFER, efer | EFER_SVME); wrmsr(MSR_VM_HSAVE_PA, svm->save_area_gpa); - clear_vmcb(vmcb); + memset(vmcb, 0, sizeof(*vmcb)); asm volatile ("vmsave %0\n\t" : : "a" (vmcb_gpa) : "memory"); vmcb_set_seg(&save->es, get_es(), 0, -1U, data_seg_attr); vmcb_set_seg(&save->cs, get_cs(), 0, -1U, code_seg_attr); diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c index e0004bd26536..32f7e09ef67c 100644 --- a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c +++ b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c @@ -17,84 +17,70 @@ /* VMCALL and VMMCALL are both 3-byte opcodes. 
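* (VMCALL is 0f 01 c1 and VMMCALL is 0f 01 d9, matching the vmx_vmcall and * svm_vmmcall byte arrays added below.)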
*/ #define HYPERCALL_INSN_SIZE 3 -static bool ud_expected; +static bool quirk_disabled; static void guest_ud_handler(struct ex_regs *regs) { - GUEST_ASSERT(ud_expected); - GUEST_DONE(); + regs->rax = -EFAULT; + regs->rip += HYPERCALL_INSN_SIZE; } -extern uint8_t svm_hypercall_insn[HYPERCALL_INSN_SIZE]; -static uint64_t svm_do_sched_yield(uint8_t apic_id) -{ - uint64_t ret; +static const uint8_t vmx_vmcall[HYPERCALL_INSN_SIZE] = { 0x0f, 0x01, 0xc1 }; +static const uint8_t svm_vmmcall[HYPERCALL_INSN_SIZE] = { 0x0f, 0x01, 0xd9 }; - asm volatile("mov %1, %%rax\n\t" - "mov %2, %%rbx\n\t" - "svm_hypercall_insn:\n\t" - "vmmcall\n\t" - "mov %%rax, %0\n\t" - : "=r"(ret) - : "r"((uint64_t)KVM_HC_SCHED_YIELD), "r"((uint64_t)apic_id) - : "rax", "rbx", "memory"); - - return ret; -} - -extern uint8_t vmx_hypercall_insn[HYPERCALL_INSN_SIZE]; -static uint64_t vmx_do_sched_yield(uint8_t apic_id) +extern uint8_t hypercall_insn[HYPERCALL_INSN_SIZE]; +static uint64_t do_sched_yield(uint8_t apic_id) { uint64_t ret; - asm volatile("mov %1, %%rax\n\t" - "mov %2, %%rbx\n\t" - "vmx_hypercall_insn:\n\t" - "vmcall\n\t" - "mov %%rax, %0\n\t" - : "=r"(ret) - : "r"((uint64_t)KVM_HC_SCHED_YIELD), "r"((uint64_t)apic_id) - : "rax", "rbx", "memory"); + asm volatile("hypercall_insn:\n\t" + ".byte 0xcc,0xcc,0xcc\n\t" + : "=a"(ret) + : "a"((uint64_t)KVM_HC_SCHED_YIELD), "b"((uint64_t)apic_id) + : "memory"); return ret; } static void guest_main(void) { - uint8_t *native_hypercall_insn, *hypercall_insn; - uint8_t apic_id; - - apic_id = GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID)); + const uint8_t *native_hypercall_insn; + const uint8_t *other_hypercall_insn; + uint64_t ret; if (is_intel_cpu()) { - native_hypercall_insn = vmx_hypercall_insn; - hypercall_insn = svm_hypercall_insn; - svm_do_sched_yield(apic_id); + native_hypercall_insn = vmx_vmcall; + other_hypercall_insn = svm_vmmcall; } else if (is_amd_cpu()) { - native_hypercall_insn = svm_hypercall_insn; - hypercall_insn = vmx_hypercall_insn; - vmx_do_sched_yield(apic_id); + native_hypercall_insn = svm_vmmcall; + other_hypercall_insn = vmx_vmcall; } else { GUEST_ASSERT(0); /* unreachable */ return; } + memcpy(hypercall_insn, other_hypercall_insn, HYPERCALL_INSN_SIZE); + + ret = do_sched_yield(GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID))); + /* - * The hypercall didn't #UD (guest_ud_handler() signals "done" if a #UD - * occurs). Verify that a #UD is NOT expected and that KVM patched in - * the native hypercall. + * If the quirk is disabled, verify that guest_ud_handler() "returned" + * -EFAULT and that KVM did NOT patch the hypercall. If the quirk is + * enabled, verify that the hypercall succeeded and that KVM patched in + * the "right" hypercall. 
*/ - GUEST_ASSERT(!ud_expected); - GUEST_ASSERT(!memcmp(native_hypercall_insn, hypercall_insn, HYPERCALL_INSN_SIZE)); - GUEST_DONE(); -} + if (quirk_disabled) { + GUEST_ASSERT(ret == (uint64_t)-EFAULT); + GUEST_ASSERT(!memcmp(other_hypercall_insn, hypercall_insn, + HYPERCALL_INSN_SIZE)); + } else { + GUEST_ASSERT(!ret); + GUEST_ASSERT(!memcmp(native_hypercall_insn, hypercall_insn, + HYPERCALL_INSN_SIZE)); + } -static void setup_ud_vector(struct kvm_vcpu *vcpu) -{ - vm_init_descriptor_tables(vcpu->vm); - vcpu_init_descriptor_tables(vcpu); - vm_install_exception_handler(vcpu->vm, UD_VECTOR, guest_ud_handler); + GUEST_DONE(); } static void enter_guest(struct kvm_vcpu *vcpu) @@ -117,35 +103,23 @@ static void enter_guest(struct kvm_vcpu *vcpu) } } -static void test_fix_hypercall(void) +static void test_fix_hypercall(bool disable_quirk) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; vm = vm_create_with_one_vcpu(&vcpu, guest_main); - setup_ud_vector(vcpu); - - ud_expected = false; - sync_global_to_guest(vm, ud_expected); - - virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); - - enter_guest(vcpu); -} -static void test_fix_hypercall_disabled(void) -{ - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - - vm = vm_create_with_one_vcpu(&vcpu, guest_main); - setup_ud_vector(vcpu); + vm_init_descriptor_tables(vcpu->vm); + vcpu_init_descriptor_tables(vcpu); + vm_install_exception_handler(vcpu->vm, UD_VECTOR, guest_ud_handler); - vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, - KVM_X86_QUIRK_FIX_HYPERCALL_INSN); + if (disable_quirk) + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, + KVM_X86_QUIRK_FIX_HYPERCALL_INSN); - ud_expected = true; - sync_global_to_guest(vm, ud_expected); + quirk_disabled = disable_quirk; + sync_global_to_guest(vm, quirk_disabled); virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); @@ -156,6 +130,6 @@ int main(void) { TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_FIX_HYPERCALL_INSN); - test_fix_hypercall(); - test_fix_hypercall_disabled(); + test_fix_hypercall(false); + test_fix_hypercall(true); } diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c index 79ab0152d281..05b32e550a80 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -26,7 +26,8 @@ static inline uint8_t hypercall(u64 control, vm_vaddr_t input_address, : "=a" (*hv_status), "+c" (control), "+d" (input_address), KVM_ASM_SAFE_OUTPUTS(vector) - : [output_address] "r"(output_address) + : [output_address] "r"(output_address), + "a" (-EFAULT) : "cc", "memory", "r8", KVM_ASM_SAFE_CLOBBERS); return vector; } @@ -81,13 +82,13 @@ static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall) } vector = hypercall(hcall->control, input, output, &res); - if (hcall->ud_expected) + if (hcall->ud_expected) { GUEST_ASSERT_2(vector == UD_VECTOR, hcall->control, vector); - else + } else { GUEST_ASSERT_2(!vector, hcall->control, vector); + GUEST_ASSERT_2(res == hcall->expect, hcall->expect, res); + } - GUEST_ASSERT_2(!hcall->ud_expected || res == hcall->expect, - hcall->expect, res); GUEST_DONE(); } @@ -507,7 +508,7 @@ static void guest_test_hcalls_access(void) switch (stage) { case 0: feat->eax |= HV_MSR_HYPERCALL_AVAILABLE; - hcall->control = 0xdeadbeef; + hcall->control = 0xbeef; hcall->expect = HV_STATUS_INVALID_HYPERCALL_CODE; break; diff --git a/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c b/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c new 
file mode 100644 index 000000000000..ac33835f78f4 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c @@ -0,0 +1,295 @@ +// SPDX-License-Identifier: GPL-2.0-only +#define _GNU_SOURCE /* for program_invocation_short_name */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" +#include "svm_util.h" + +#define L2_GUEST_STACK_SIZE 256 + +/* + * Arbitrary, never shoved into KVM/hardware, just need to avoid conflict with + * the "real" exceptions used, #SS/#GP/#DF (12/13/8). + */ +#define FAKE_TRIPLE_FAULT_VECTOR 0xaa + +/* Arbitrary 32-bit error code injected by this test. */ +#define SS_ERROR_CODE 0xdeadbeef + +/* + * Bit '0' is set on Intel if the exception occurs while delivering a previous + * event/exception. AMD's wording is ambiguous, but presumably the bit is set + * if the exception occurs while delivering an external event, e.g. NMI or INTR, + * but not for exceptions that occur when delivering other exceptions or + * software interrupts. + * + * Note, Intel's name for it, "External event", is misleading and much more + * aligned with AMD's behavior, but the SDM is quite clear on its behavior. + */ +#define ERROR_CODE_EXT_FLAG BIT(0) + +/* + * Bit '1' is set if the fault occurred when looking up a descriptor in the + * IDT, which is the case here as the IDT is empty/NULL. + */ +#define ERROR_CODE_IDT_FLAG BIT(1) + +/* + * The #GP that occurs when vectoring #SS should show the index into the IDT + * for #SS, plus have the "IDT flag" set. + */ +#define GP_ERROR_CODE_AMD ((SS_VECTOR * 8) | ERROR_CODE_IDT_FLAG) +#define GP_ERROR_CODE_INTEL ((SS_VECTOR * 8) | ERROR_CODE_IDT_FLAG | ERROR_CODE_EXT_FLAG) + +/* + * Intel and AMD both shove '0' into the error code on #DF, regardless of what + * led to the double fault. 
+ */ +#define DF_ERROR_CODE 0 + +#define INTERCEPT_SS (BIT_ULL(SS_VECTOR)) +#define INTERCEPT_SS_DF (INTERCEPT_SS | BIT_ULL(DF_VECTOR)) +#define INTERCEPT_SS_GP_DF (INTERCEPT_SS_DF | BIT_ULL(GP_VECTOR)) + +static void l2_ss_pending_test(void) +{ + GUEST_SYNC(SS_VECTOR); +} + +static void l2_ss_injected_gp_test(void) +{ + GUEST_SYNC(GP_VECTOR); +} + +static void l2_ss_injected_df_test(void) +{ + GUEST_SYNC(DF_VECTOR); +} + +static void l2_ss_injected_tf_test(void) +{ + GUEST_SYNC(FAKE_TRIPLE_FAULT_VECTOR); +} + +static void svm_run_l2(struct svm_test_data *svm, void *l2_code, int vector, + uint32_t error_code) +{ + struct vmcb *vmcb = svm->vmcb; + struct vmcb_control_area *ctrl = &vmcb->control; + + vmcb->save.rip = (u64)l2_code; + run_guest(vmcb, svm->vmcb_gpa); + + if (vector == FAKE_TRIPLE_FAULT_VECTOR) + return; + + GUEST_ASSERT_EQ(ctrl->exit_code, (SVM_EXIT_EXCP_BASE + vector)); + GUEST_ASSERT_EQ(ctrl->exit_info_1, error_code); +} + +static void l1_svm_code(struct svm_test_data *svm) +{ + struct vmcb_control_area *ctrl = &svm->vmcb->control; + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + generic_svm_setup(svm, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + svm->vmcb->save.idtr.limit = 0; + ctrl->intercept |= BIT_ULL(INTERCEPT_SHUTDOWN); + + ctrl->intercept_exceptions = INTERCEPT_SS_GP_DF; + svm_run_l2(svm, l2_ss_pending_test, SS_VECTOR, SS_ERROR_CODE); + svm_run_l2(svm, l2_ss_injected_gp_test, GP_VECTOR, GP_ERROR_CODE_AMD); + + ctrl->intercept_exceptions = INTERCEPT_SS_DF; + svm_run_l2(svm, l2_ss_injected_df_test, DF_VECTOR, DF_ERROR_CODE); + + ctrl->intercept_exceptions = INTERCEPT_SS; + svm_run_l2(svm, l2_ss_injected_tf_test, FAKE_TRIPLE_FAULT_VECTOR, 0); + GUEST_ASSERT_EQ(ctrl->exit_code, SVM_EXIT_SHUTDOWN); + + GUEST_DONE(); +} + +static void vmx_run_l2(void *l2_code, int vector, uint32_t error_code) +{ + GUEST_ASSERT(!vmwrite(GUEST_RIP, (u64)l2_code)); + + GUEST_ASSERT_EQ(vector == SS_VECTOR ? vmlaunch() : vmresume(), 0); + + if (vector == FAKE_TRIPLE_FAULT_VECTOR) + return; + + GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_EXCEPTION_NMI); + GUEST_ASSERT_EQ((vmreadz(VM_EXIT_INTR_INFO) & 0xff), vector); + GUEST_ASSERT_EQ(vmreadz(VM_EXIT_INTR_ERROR_CODE), error_code); +} + +static void l1_vmx_code(struct vmx_pages *vmx) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + GUEST_ASSERT_EQ(prepare_for_vmx_operation(vmx), true); + + GUEST_ASSERT_EQ(load_vmcs(vmx), true); + + prepare_vmcs(vmx, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + GUEST_ASSERT_EQ(vmwrite(GUEST_IDTR_LIMIT, 0), 0); + + /* + * VMX disallows injecting an exception with error_code[31:16] != 0, + * and hardware will never generate a VM-Exit with bits 31:16 set. + * KVM should likewise truncate the "bad" userspace value. 
+ */ + GUEST_ASSERT_EQ(vmwrite(EXCEPTION_BITMAP, INTERCEPT_SS_GP_DF), 0); + vmx_run_l2(l2_ss_pending_test, SS_VECTOR, (u16)SS_ERROR_CODE); + vmx_run_l2(l2_ss_injected_gp_test, GP_VECTOR, GP_ERROR_CODE_INTEL); + + GUEST_ASSERT_EQ(vmwrite(EXCEPTION_BITMAP, INTERCEPT_SS_DF), 0); + vmx_run_l2(l2_ss_injected_df_test, DF_VECTOR, DF_ERROR_CODE); + + GUEST_ASSERT_EQ(vmwrite(EXCEPTION_BITMAP, INTERCEPT_SS), 0); + vmx_run_l2(l2_ss_injected_tf_test, FAKE_TRIPLE_FAULT_VECTOR, 0); + GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_TRIPLE_FAULT); + + GUEST_DONE(); +} + +static void __attribute__((__flatten__)) l1_guest_code(void *test_data) +{ + if (this_cpu_has(X86_FEATURE_SVM)) + l1_svm_code(test_data); + else + l1_vmx_code(test_data); +} + +static void assert_ucall_vector(struct kvm_vcpu *vcpu, int vector) +{ + struct kvm_run *run = vcpu->run; + struct ucall uc; + + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Unexpected exit reason: %u (%s),\n", + run->exit_reason, exit_reason_str(run->exit_reason)); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + TEST_ASSERT(vector == uc.args[1], + "Expected L2 to ask for %d, got %ld", vector, uc.args[1]); + break; + case UCALL_DONE: + TEST_ASSERT(vector == -1, + "Expected L2 to ask for %d, L2 says it's done", vector); + break; + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld (0x%lx != 0x%lx)", + (const char *)uc.args[0], __FILE__, uc.args[1], + uc.args[2], uc.args[3]); + break; + default: + TEST_FAIL("Expected L2 to ask for %d, got unexpected ucall %lu", vector, uc.cmd); + } +} + +static void queue_ss_exception(struct kvm_vcpu *vcpu, bool inject) +{ + struct kvm_vcpu_events events; + + vcpu_events_get(vcpu, &events); + + TEST_ASSERT(!events.exception.pending, + "Vector %d unexpectedly pending", events.exception.nr); + TEST_ASSERT(!events.exception.injected, + "Vector %d unexpectedly injected", events.exception.nr); + + events.flags = KVM_VCPUEVENT_VALID_PAYLOAD; + events.exception.pending = !inject; + events.exception.injected = inject; + events.exception.nr = SS_VECTOR; + events.exception.has_error_code = true; + events.exception.error_code = SS_ERROR_CODE; + vcpu_events_set(vcpu, &events); +} + +/* + * Verify KVM_{G,S}ET_EVENTS play nice with pending vs. injected exceptions + * when an exception is being queued for L2. Specifically, verify that KVM + * honors L1 exception intercept controls when a #SS is pending/injected, + * triggers a #GP on vectoring the #SS, morphs to #DF if #GP isn't intercepted + * by L1, and finally causes (nested) SHUTDOWN if #DF isn't intercepted by L1. + */ +int main(int argc, char *argv[]) +{ + vm_vaddr_t nested_test_data_gva; + struct kvm_vcpu_events events; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_EXCEPTION_PAYLOAD)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM) || kvm_cpu_has(X86_FEATURE_VMX)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + vm_enable_cap(vm, KVM_CAP_EXCEPTION_PAYLOAD, -2ul); + + if (kvm_cpu_has(X86_FEATURE_SVM)) + vcpu_alloc_svm(vm, &nested_test_data_gva); + else + vcpu_alloc_vmx(vm, &nested_test_data_gva); + + vcpu_args_set(vcpu, 1, nested_test_data_gva); + + /* Run L1 => L2. L2 should sync and request #SS. */ + vcpu_run(vcpu); + assert_ucall_vector(vcpu, SS_VECTOR); + + /* Pend #SS and request immediate exit. #SS should still be pending. */ + queue_ss_exception(vcpu, false); + vcpu->run->immediate_exit = true; + vcpu_run_complete_io(vcpu); + + /* Verify the pending event comes back out the same as it went in. */ + vcpu_events_get(vcpu, &events); + ASSERT_EQ(events.flags & KVM_VCPUEVENT_VALID_PAYLOAD, + KVM_VCPUEVENT_VALID_PAYLOAD); + ASSERT_EQ(events.exception.pending, true); + ASSERT_EQ(events.exception.nr, SS_VECTOR); + ASSERT_EQ(events.exception.has_error_code, true); + ASSERT_EQ(events.exception.error_code, SS_ERROR_CODE); + + /* + * Run for real with the pending #SS, L1 should get a VM-Exit due to + * #SS interception and re-enter L2 to request #GP (via injected #SS). + */ + vcpu->run->immediate_exit = false; + vcpu_run(vcpu); + assert_ucall_vector(vcpu, GP_VECTOR); + + /* + * Inject #SS, the #SS should bypass interception and cause #GP, which + * L1 should intercept before KVM morphs it to #DF. L1 should then + * disable #GP interception and run L2 to request #DF (via #SS => #GP). + */ + queue_ss_exception(vcpu, true); + vcpu_run(vcpu); + assert_ucall_vector(vcpu, DF_VECTOR); + + /* + * Inject #SS, the #SS should bypass interception and cause #GP, which + * L1 is no longer intercepting, and so should see a #DF VM-Exit. L1 + * should then signal that it is done. + */ + queue_ss_exception(vcpu, true); + vcpu_run(vcpu); + assert_ucall_vector(vcpu, FAKE_TRIPLE_FAULT_VECTOR); + + /* + * Inject #SS yet again. L1 is not intercepting #GP or #DF, and so + * should see nested TRIPLE_FAULT / SHUTDOWN. + */ + queue_ss_exception(vcpu, true); + vcpu_run(vcpu); + assert_ucall_vector(vcpu, -1); + + kvm_vm_free(vm); +}
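The GP_ERROR_CODE_AMD/GP_ERROR_CODE_INTEL constants checked by this test follow the architectural layout of the error code pushed for a fault during IDT vectoring; as a standalone sketch of that encoding (illustration only, not part of the patch):

        #include <stdbool.h>
        #include <stdint.h>

        /*
         * Error code for a fault hit while looking up an IDT descriptor:
         * bits 15:3 carry the vector (an index into the IDT), bit 1 is the
         * "IDT" flag, and bit 0 is the "EXT" flag that Intel sets when the
         * fault occurs while delivering a prior event.
         */
        static uint32_t idt_fault_error_code(uint8_t vector, bool ext)
        {
                uint32_t ec = (uint32_t)vector * 8;     /* IDT index */

                ec |= 1u << 1;                          /* IDT flag */
                if (ext)
                        ec |= 1u << 0;                  /* EXT flag */
                return ec;
        }

        /*
         * For the #SS vector (12) used by the test:
         * idt_fault_error_code(12, false) == GP_ERROR_CODE_AMD and
         * idt_fault_error_code(12, true)  == GP_ERROR_CODE_INTEL.
         */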
diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c index cc6421716400..59ffe7fd354f 100644 --- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c +++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c @@ -112,19 +112,13 @@ void run_test(int reclaim_period_ms, bool disable_nx_huge_pages, { struct kvm_vcpu *vcpu; struct kvm_vm *vm; + uint64_t nr_bytes; void *hva; int r; vm = vm_create(1); if (disable_nx_huge_pages) { - /* - * Cannot run the test without NX huge pages if the kernel - * does not support it. - */ - if (!kvm_check_cap(KVM_CAP_VM_DISABLE_NX_HUGE_PAGES)) - return; - r = __vm_disable_nx_huge_pages(vm); if (reboot_permissions) { TEST_ASSERT(!r, "Disabling NX huge pages should succeed if process has reboot permissions"); @@ -141,10 +135,24 @@ void run_test(int reclaim_period_ms, bool disable_nx_huge_pages, HPAGE_GPA, HPAGE_SLOT, HPAGE_SLOT_NPAGES, 0); - virt_map(vm, HPAGE_GVA, HPAGE_GPA, HPAGE_SLOT_NPAGES); + nr_bytes = HPAGE_SLOT_NPAGES * vm->page_size; + + /* + * Ensure that KVM can map HPAGE_SLOT with huge pages by mapping the + * region into the guest with 2MiB pages whenever TDP is disabled (i.e. + * whenever KVM is shadowing the guest page tables). + * + * When TDP is enabled, KVM should be able to map HPAGE_SLOT with huge + * pages irrespective of the guest page size, so map with 4KiB pages + * to test that that is the case. + */ + if (kvm_is_tdp_enabled()) + virt_map_level(vm, HPAGE_GVA, HPAGE_GPA, nr_bytes, PG_LEVEL_4K); + else + virt_map_level(vm, HPAGE_GVA, HPAGE_GPA, nr_bytes, PG_LEVEL_2M); hva = addr_gpa2hva(vm, HPAGE_GPA); - memset(hva, RETURN_OPCODE, HPAGE_SLOT_NPAGES * PAGE_SIZE); + memset(hva, RETURN_OPCODE, nr_bytes); check_2m_page_count(vm, 0); check_split_count(vm, 0); @@ -248,18 +256,13 @@ int main(int argc, char **argv) { } } - if (token != MAGIC_TOKEN) { - print_skip("This test must be run with the magic token %d.\n" - "This is done by nx_huge_pages_test.sh, which\n" - "also handles environment setup for the test.", - MAGIC_TOKEN); - exit(KSFT_SKIP); - } + TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_DISABLE_NX_HUGE_PAGES)); + TEST_REQUIRE(reclaim_period_ms > 0); - if (!reclaim_period_ms) { - print_skip("The NX reclaim period must be specified and non-zero"); - exit(KSFT_SKIP); - } + __TEST_REQUIRE(token == MAGIC_TOKEN, + "This test must be run with the magic token %d.\n" + "This is done by nx_huge_pages_test.sh, which\n" + "also handles environment setup for the test.", MAGIC_TOKEN); run_test(reclaim_period_ms, false, reboot_permissions); run_test(reclaim_period_ms, true, reboot_permissions);
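kvm_is_tdp_enabled() steers the virt_map_level() choice above by asking whether EPT/NPT is in use; a rough, standalone approximation for illustration (assumed /sys/module paths, not the selftest library's exact helper, error handling elided):

        #include <stdio.h>

        /* ept (Intel) and npt (AMD) are boolean module params, 'Y'/'N' or '1'/'0'. */
        static int tdp_enabled_guess(void)
        {
                const char *params[] = {
                        "/sys/module/kvm_intel/parameters/ept",
                        "/sys/module/kvm_amd/parameters/npt",
                };
                for (int i = 0; i < 2; i++) {
                        FILE *f = fopen(params[i], "r");
                        int c;

                        if (!f)
                                continue;       /* other vendor's module */
                        c = fgetc(f);
                        fclose(f);
                        return c == 'Y' || c == 'y' || c == '1';
                }
                return -1;      /* KVM not loaded; caller must decide */
        }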
diff --git a/tools/testing/selftests/livepatch/Makefile b/tools/testing/selftests/livepatch/Makefile index 1acc9e1fa3fb..02fadc9d55e0 100644 --- a/tools/testing/selftests/livepatch/Makefile +++ b/tools/testing/selftests/livepatch/Makefile @@ -6,7 +6,8 @@ TEST_PROGS := \ test-callbacks.sh \ test-shadow-vars.sh \ test-state.sh \ - test-ftrace.sh + test-ftrace.sh \ + test-sysfs.sh TEST_FILES := settings diff --git a/tools/testing/selftests/livepatch/functions.sh b/tools/testing/selftests/livepatch/functions.sh index 9230b869371d..c8416c54b463 100644 --- a/tools/testing/selftests/livepatch/functions.sh +++ b/tools/testing/selftests/livepatch/functions.sh @@ -6,6 +6,7 @@ MAX_RETRIES=600 RETRY_INTERVAL=".1" # seconds +KLP_SYSFS_DIR="/sys/kernel/livepatch" # Kselftest framework requirement - SKIP code is 4 ksft_skip=4 @@ -86,7 +87,7 @@ function set_ftrace_enabled() { if [[ "$result" != "$1" ]] ; then if [[ $can_fail -eq 1 ]] ; then - echo "livepatch: $err" > /dev/kmsg + echo "livepatch: $err" | sed 's#/proc/sys/kernel/#kernel.#' > /dev/kmsg return fi @@ -308,3 +309,36 @@ function check_result { cleanup_dmesg_file } + +# check_sysfs_rights(modname, rel_path, expected_rights) - check sysfs +# path permissions +# modname - livepatch module creating the sysfs interface +# rel_path - relative path of the sysfs interface +# expected_rights - expected access rights +function check_sysfs_rights() { + local mod="$1"; shift + local rel_path="$1"; shift + local expected_rights="$1"; shift + + local path="$KLP_SYSFS_DIR/$mod/$rel_path" + local rights=$(/bin/stat --format '%A' "$path") + if test "$rights" != "$expected_rights" ; then + die "Unexpected access rights of $path: $expected_rights vs. $rights" + fi +} + +# check_sysfs_value(modname, rel_path, expected_value) - check sysfs value +# modname - livepatch module creating the sysfs interface +# rel_path - relative path of the sysfs interface +# expected_value - expected value read from the file +function check_sysfs_value() { + local mod="$1"; shift + local rel_path="$1"; shift + local expected_value="$1"; shift + + local path="$KLP_SYSFS_DIR/$mod/$rel_path" + local value=`cat $path` + if test "$value" != "$expected_value" ; then + die "Unexpected value in $path: $expected_value vs. $value" + fi +}
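check_sysfs_rights() compares against the symbolic permission string printed by stat --format '%A'; for illustration, the same string can be rebuilt from st_mode in C (hypothetical helper, not used by the suite; setuid/setgid/sticky bits omitted):

        #include <stdio.h>
        #include <sys/stat.h>

        /* Rebuild the "drwxr-xr-x" style string that stat --format '%A' prints. */
        static void mode_string(mode_t m, char out[11])
        {
                static const mode_t bits[9] = {
                        S_IRUSR, S_IWUSR, S_IXUSR,
                        S_IRGRP, S_IWGRP, S_IXGRP,
                        S_IROTH, S_IWOTH, S_IXOTH,
                };
                static const char chars[] = "rwxrwxrwx";
                int i;

                out[0] = S_ISDIR(m) ? 'd' : '-';
                for (i = 0; i < 9; i++)
                        out[i + 1] = (m & bits[i]) ? chars[i] : '-';
                out[10] = '\0';
        }

        int main(void)
        {
                struct stat st;
                char buf[11];

                /* /sys/kernel/livepatch exists only with CONFIG_LIVEPATCH. */
                if (stat("/sys/kernel/livepatch", &st) == 0) {
                        mode_string(st.st_mode, buf);
                        printf("%s\n", buf);    /* e.g. "drwxr-xr-x" */
                }
                return 0;
        }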
diff --git a/tools/testing/selftests/livepatch/test-sysfs.sh b/tools/testing/selftests/livepatch/test-sysfs.sh new file mode 100755 index 000000000000..7f76f280189a --- /dev/null +++ b/tools/testing/selftests/livepatch/test-sysfs.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2022 Song Liu <song@kernel.org> + +. $(dirname $0)/functions.sh + +MOD_LIVEPATCH=test_klp_livepatch + +setup_config + +# - load a livepatch and verify that the sysfs entries work as expected + +start_test "sysfs test" + +load_lp $MOD_LIVEPATCH + +check_sysfs_rights "$MOD_LIVEPATCH" "" "drwxr-xr-x" +check_sysfs_rights "$MOD_LIVEPATCH" "enabled" "-rw-r--r--" +check_sysfs_value "$MOD_LIVEPATCH" "enabled" "1" +check_sysfs_rights "$MOD_LIVEPATCH" "force" "--w-------" +check_sysfs_rights "$MOD_LIVEPATCH" "transition" "-r--r--r--" +check_sysfs_value "$MOD_LIVEPATCH" "transition" "0" +check_sysfs_rights "$MOD_LIVEPATCH" "vmlinux/patched" "-r--r--r--" +check_sysfs_value "$MOD_LIVEPATCH" "vmlinux/patched" "1" + +disable_lp $MOD_LIVEPATCH + +unload_lp $MOD_LIVEPATCH + +check_result "% modprobe $MOD_LIVEPATCH +livepatch: enabling patch '$MOD_LIVEPATCH' +livepatch: '$MOD_LIVEPATCH': initializing patching transition +livepatch: '$MOD_LIVEPATCH': starting patching transition +livepatch: '$MOD_LIVEPATCH': completing patching transition +livepatch: '$MOD_LIVEPATCH': patching complete +% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled +livepatch: '$MOD_LIVEPATCH': initializing unpatching transition +livepatch: '$MOD_LIVEPATCH': starting unpatching transition +livepatch: '$MOD_LIVEPATCH': completing unpatching transition +livepatch: '$MOD_LIVEPATCH': unpatching complete +% rmmod $MOD_LIVEPATCH" + +start_test "sysfs test object/patched" + +MOD_LIVEPATCH=test_klp_callbacks_demo +MOD_TARGET=test_klp_callbacks_mod +load_lp $MOD_LIVEPATCH + +# check that the "patched" file changes as the target module loads/unloads +check_sysfs_value "$MOD_LIVEPATCH" "$MOD_TARGET/patched" "0" +load_mod $MOD_TARGET +check_sysfs_value "$MOD_LIVEPATCH" "$MOD_TARGET/patched" "1" +unload_mod $MOD_TARGET +check_sysfs_value "$MOD_LIVEPATCH" "$MOD_TARGET/patched" "0" + +disable_lp $MOD_LIVEPATCH +unload_lp $MOD_LIVEPATCH + +check_result "% modprobe test_klp_callbacks_demo +livepatch: enabling patch 'test_klp_callbacks_demo' +livepatch: 'test_klp_callbacks_demo': initializing patching transition +test_klp_callbacks_demo: pre_patch_callback: vmlinux +livepatch: 'test_klp_callbacks_demo': starting patching transition +livepatch: 'test_klp_callbacks_demo': completing patching transition +test_klp_callbacks_demo: post_patch_callback: vmlinux +livepatch: 'test_klp_callbacks_demo': patching complete +% modprobe test_klp_callbacks_mod +livepatch: applying patch 'test_klp_callbacks_demo' to loading module 'test_klp_callbacks_mod' +test_klp_callbacks_demo: pre_patch_callback: test_klp_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init +test_klp_callbacks_demo: post_patch_callback: test_klp_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init +test_klp_callbacks_mod: test_klp_callbacks_mod_init +% rmmod test_klp_callbacks_mod +test_klp_callbacks_mod: test_klp_callbacks_mod_exit +test_klp_callbacks_demo: pre_unpatch_callback: test_klp_callbacks_mod -> [MODULE_STATE_GOING] Going away +livepatch: reverting patch 'test_klp_callbacks_demo' on unloading module 'test_klp_callbacks_mod' +test_klp_callbacks_demo: post_unpatch_callback: test_klp_callbacks_mod -> [MODULE_STATE_GOING] Going away +% echo 
0 > /sys/kernel/livepatch/test_klp_callbacks_demo/enabled +livepatch: 'test_klp_callbacks_demo': initializing unpatching transition +test_klp_callbacks_demo: pre_unpatch_callback: vmlinux +livepatch: 'test_klp_callbacks_demo': starting unpatching transition +livepatch: 'test_klp_callbacks_demo': completing unpatching transition +test_klp_callbacks_demo: post_unpatch_callback: vmlinux +livepatch: 'test_klp_callbacks_demo': unpatching complete +% rmmod test_klp_callbacks_demo" + +exit 0 diff --git a/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh b/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh index 46a97f318f58..74ee5067a8ce 100755 --- a/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh +++ b/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh @@ -134,6 +134,16 @@ offline_memory_expect_fail() return 0 } +online_all_offline_memory() +{ + for memory in `hotpluggable_offline_memory`; do + if ! online_memory_expect_success $memory; then + echo "$FUNCNAME $memory: unexpected fail" >&2 + retval=1 + fi + done +} + error=-12 priority=0 # Run with default of ratio=2 for Kselftest run @@ -197,8 +207,11 @@ echo -e "\t trying to offline $target out of $hotpluggable_num memory block(s):" for memory in `hotpluggable_online_memory`; do if [ "$target" -gt 0 ]; then echo "online->offline memory$memory" - if offline_memory_expect_success $memory; then + if offline_memory_expect_success $memory &>/dev/null; then target=$(($target - 1)) + echo "-> Success" + else + echo "-> Failure" fi fi done @@ -257,7 +270,7 @@ prerequisite_extra echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_OFFLINE/error for memory in `hotpluggable_online_memory`; do if [ $((RANDOM % 100)) -lt $ratio ]; then - offline_memory_expect_success $memory + offline_memory_expect_success $memory &>/dev/null fi done @@ -266,16 +279,16 @@ done # echo $error > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_ONLINE/error for memory in `hotpluggable_offline_memory`; do - online_memory_expect_fail $memory + if ! online_memory_expect_fail $memory; then + retval=1 + fi done # # Online all hot-pluggable memory # echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_ONLINE/error -for memory in `hotpluggable_offline_memory`; do - online_memory_expect_success $memory -done +online_all_offline_memory # # Test memory hot-remove error handling (online => offline) @@ -283,11 +296,18 @@ done echo $error > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_OFFLINE/error for memory in `hotpluggable_online_memory`; do if [ $((RANDOM % 100)) -lt $ratio ]; then - offline_memory_expect_fail $memory + if ! 
offline_memory_expect_fail $memory; then + retval=1 + fi fi done echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_OFFLINE/error /sbin/modprobe -q -r memory-notifier-error-inject +# +# Restore memory before exit +# +online_all_offline_memory + exit $retval diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 2a6b0bc648c4..69c58362c0ed 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -70,6 +70,7 @@ TEST_PROGS += io_uring_zerocopy_tx.sh TEST_GEN_FILES += bind_bhash TEST_GEN_PROGS += sk_bind_sendto_listen TEST_GEN_PROGS += sk_connect_zero_addr +TEST_PROGS += test_ingress_egress_chaining.sh TEST_FILES := settings diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index d5a0dd548989..ee5e98204d3d 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -1223,6 +1223,11 @@ ipv4_fcnal() log_test $rc 0 "Delete nexthop route warning" run_cmd "$IP route delete 172.16.101.1/32 nhid 12" run_cmd "$IP nexthop del id 12" + + run_cmd "$IP nexthop add id 21 via 172.16.1.6 dev veth1" + run_cmd "$IP ro add 172.16.101.0/24 nhid 21" + run_cmd "$IP ro del 172.16.101.0/24 nexthop via 172.16.1.7 dev veth1 nexthop via 172.16.1.8 dev veth1" + log_test $? 2 "Delete multipath route with only nh id based entry" } ipv4_grp_fcnal() diff --git a/tools/testing/selftests/net/test_ingress_egress_chaining.sh b/tools/testing/selftests/net/test_ingress_egress_chaining.sh new file mode 100644 index 000000000000..08adff6bb3b6 --- /dev/null +++ b/tools/testing/selftests/net/test_ingress_egress_chaining.sh @@ -0,0 +1,79 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This test runs a simple ingress tc setup between two veth pairs, +# and chains a single egress rule to test ingress chaining to egress. +# +# Kselftest framework requirement - SKIP code is 4. 
+ksft_skip=4 + +if [ "$(id -u)" -ne 0 ]; then + echo "SKIP: Need root privileges" + exit $ksft_skip +fi + +needed_mods="act_mirred cls_flower sch_ingress" +for mod in $needed_mods; do + modinfo $mod &>/dev/null || { echo "SKIP: Need $mod module"; exit $ksft_skip; } +done + +ns="ns$((RANDOM%899+100))" +veth1="veth1$((RANDOM%899+100))" +veth2="veth2$((RANDOM%899+100))" +peer1="peer1$((RANDOM%899+100))" +peer2="peer2$((RANDOM%899+100))" +ip_peer1=198.51.100.5 +ip_peer2=198.51.100.6 + +function fail() { + echo "FAIL: $@" >> /dev/stderr + exit 1 +} + +function cleanup() { + killall -q -9 udpgso_bench_rx + ip link del $veth1 &> /dev/null + ip link del $veth2 &> /dev/null + ip netns del $ns &> /dev/null +} +trap cleanup EXIT + +function config() { + echo "Setup veth pair [$veth1, $peer1] and veth pair [$veth2, $peer2]" + ip link add $veth1 type veth peer name $peer1 + ip link add $veth2 type veth peer name $peer2 + ip addr add $ip_peer1/24 dev $peer1 + ip link set $peer1 up + ip netns add $ns + ip link set dev $peer2 netns $ns + ip netns exec $ns ip addr add $ip_peer2/24 dev $peer2 + ip netns exec $ns ip link set $peer2 up + ip link set $veth1 up + ip link set $veth2 up + + echo "Add tc filter ingress->egress forwarding $veth1 <-> $veth2" + tc qdisc add dev $veth2 ingress + tc qdisc add dev $veth1 ingress + tc filter add dev $veth2 ingress prio 1 proto all flower \ action mirred egress redirect dev $veth1 + tc filter add dev $veth1 ingress prio 1 proto all flower \ action mirred egress redirect dev $veth2 + + echo "Add tc filter egress->ingress forwarding $peer1 -> $veth1, bypassing the veth pipe" + tc qdisc add dev $peer1 clsact + tc filter add dev $peer1 egress prio 20 proto ip flower \ action mirred ingress redirect dev $veth1 +} + +function test_run() { + echo "Run TCP traffic" + ./udpgso_bench_rx -t & + sleep 1 + ip netns exec $ns timeout -k 2 10 ./udpgso_bench_tx -t -l 2 -4 -D $ip_peer1 || fail "traffic failed" + echo "Test passed" +} + +config +test_run +trap - EXIT +cleanup
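test_run drives TCP through the chained redirects with the net selftests' udpgso_bench_rx/_tx helpers, where -t selects TCP mode. Purely for illustration, a bare-bones TCP sink in the same spirit (hypothetical stand-in with a fixed port, most error handling elided):

        #include <arpa/inet.h>
        #include <stdio.h>
        #include <sys/socket.h>
        #include <unistd.h>

        int main(void)
        {
                char buf[4096];
                int srv = socket(AF_INET, SOCK_STREAM, 0);
                struct sockaddr_in addr = {
                        .sin_family = AF_INET,
                        .sin_port = htons(8000),        /* assumed port */
                        .sin_addr.s_addr = htonl(INADDR_ANY),
                };
                int one = 1, conn;
                long total = 0;
                ssize_t n;

                setsockopt(srv, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
                bind(srv, (struct sockaddr *)&addr, sizeof(addr));
                listen(srv, 1);
                conn = accept(srv, NULL, NULL); /* the peer netns connects here */
                while ((n = read(conn, buf, sizeof(buf))) > 0)
                        total += n;             /* traffic made it through tc */
                printf("received %ld bytes\n", total);
                close(conn);
                close(srv);
                return 0;
        }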
diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile index 600e3a19d5e2..4504ee07be08 100644 --- a/tools/testing/selftests/netfilter/Makefile +++ b/tools/testing/selftests/netfilter/Makefile @@ -6,7 +6,7 @@ TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \ nft_concat_range.sh nft_conntrack_helper.sh \ nft_queue.sh nft_meta.sh nf_nat_edemux.sh \ ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh \ - conntrack_vrf.sh nft_synproxy.sh + conntrack_vrf.sh nft_synproxy.sh rpath.sh CFLAGS += $(shell pkg-config --cflags libmnl 2>/dev/null || echo "-I/usr/include/libmnl") LDLIBS = -lmnl diff --git a/tools/testing/selftests/netfilter/nft_fib.sh b/tools/testing/selftests/netfilter/nft_fib.sh index fd76b69635a4..dff476e45e77 100755 --- a/tools/testing/selftests/netfilter/nft_fib.sh +++ b/tools/testing/selftests/netfilter/nft_fib.sh @@ -188,6 +188,7 @@ test_ping() { ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null +ip netns exec ${nsrouter} sysctl net.ipv4.conf.all.rp_filter=0 > /dev/null ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.rp_filter=0 > /dev/null sleep 3 diff --git a/tools/testing/selftests/netfilter/rpath.sh b/tools/testing/selftests/netfilter/rpath.sh new file mode 100755 index 000000000000..2d8da7bd8ab7 --- /dev/null +++ b/tools/testing/selftests/netfilter/rpath.sh @@ -0,0 +1,147 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# return code to signal skipped test +ksft_skip=4 + +# search for legacy iptables (it uses the xtables extensions) +if iptables-legacy --version >/dev/null 2>&1; then + iptables='iptables-legacy' +elif iptables --version >/dev/null 2>&1; then + iptables='iptables' +else + iptables='' +fi + +if ip6tables-legacy --version >/dev/null 2>&1; then + ip6tables='ip6tables-legacy' +elif ip6tables --version >/dev/null 2>&1; then + ip6tables='ip6tables' +else + ip6tables='' +fi + +if nft --version >/dev/null 2>&1; then + nft='nft' +else + nft='' +fi + +if [ -z "$iptables$ip6tables$nft" ]; then + echo "SKIP: Test needs iptables, ip6tables or nft" + exit $ksft_skip +fi + +sfx=$(mktemp -u "XXXXXXXX") +ns1="ns1-$sfx" +ns2="ns2-$sfx" +trap "ip netns del $ns1; ip netns del $ns2" EXIT + +# create two netns, disable rp_filter in ns2 and +# keep IPv6 address when moving into VRF +ip netns add "$ns1" +ip netns add "$ns2" +ip netns exec "$ns2" sysctl -q net.ipv4.conf.all.rp_filter=0 +ip netns exec "$ns2" sysctl -q net.ipv4.conf.default.rp_filter=0 +ip netns exec "$ns2" sysctl -q net.ipv6.conf.all.keep_addr_on_down=1 + +# a standard connection between the netns, should not trigger rp filter +ip -net "$ns1" link add v0 type veth peer name v0 netns "$ns2" +ip -net "$ns1" link set v0 up; ip -net "$ns2" link set v0 up +ip -net "$ns1" a a 192.168.23.2/24 dev v0 +ip -net "$ns2" a a 192.168.23.1/24 dev v0 +ip -net "$ns1" a a fec0:23::2/64 dev v0 nodad +ip -net "$ns2" a a fec0:23::1/64 dev v0 nodad + +# rp filter testing: ns1 sends packets via v0 which ns2 would route back via d0 +ip -net "$ns2" link add d0 type dummy +ip -net "$ns2" link set d0 up +ip -net "$ns1" a a 192.168.42.2/24 dev v0 +ip -net "$ns2" a a 192.168.42.1/24 dev d0 +ip -net "$ns1" a a fec0:42::2/64 dev v0 nodad +ip -net "$ns2" a a fec0:42::1/64 dev d0 nodad + +# firewall matches to test +ip netns exec "$ns2" "$iptables" -t raw -A PREROUTING -s 192.168.0.0/16 -m rpfilter +ip netns exec "$ns2" "$ip6tables" -t raw -A PREROUTING -s fec0::/16 -m rpfilter +ip netns exec "$ns2" nft -f - <<EOF +table inet t { + chain c { + type filter hook prerouting priority raw; + ip saddr 192.168.0.0/16 fib saddr . iif oif exists counter + ip6 saddr fec0::/16 fib saddr . iif oif exists counter + } +} +EOF + +die() { + echo "FAIL: $*" + #ip netns exec "$ns2" "$iptables" -t raw -vS + #ip netns exec "$ns2" "$ip6tables" -t raw -vS + #ip netns exec "$ns2" nft list ruleset + exit 1 +} + +# check rule counters, return true if rule did not match +ipt_zero_rule() { # (command) + [ -n "$1" ] || return 0 + ip netns exec "$ns2" "$1" -t raw -vS | grep -q -- "-m rpfilter -c 0 0" +} +nft_zero_rule() { # (family) + [ -n "$nft" ] || return 0 + ip netns exec "$ns2" "$nft" list chain inet t c | \ grep -q "$1 saddr .* counter packets 0 bytes 0" +} + +netns_ping() { # (netns, args...) + local netns="$1" + shift + ip netns exec "$netns" ping -q -c 1 -W 1 "$@" >/dev/null +} + +testrun() { + # clear counters first + [ -n "$iptables" ] && ip netns exec "$ns2" "$iptables" -t raw -Z + [ -n "$ip6tables" ] && ip netns exec "$ns2" "$ip6tables" -t raw -Z + if [ -n "$nft" ]; then + ( + echo "delete table inet t"; + ip netns exec "$ns2" nft -s list table inet t; + ) | ip netns exec "$ns2" nft -f - + fi + + # test 1: martian traffic should fail rpfilter matches + netns_ping "$ns1" -I v0 192.168.42.1 && \ die "martian ping 192.168.42.1 succeeded" + netns_ping "$ns1" -I v0 fec0:42::1 && \ die "martian ping fec0:42::1 succeeded" + + ipt_zero_rule "$iptables" || die "iptables matched martian" + ipt_zero_rule "$ip6tables" || die "ip6tables matched martian" + nft_zero_rule ip || die "nft IPv4 matched martian" + nft_zero_rule ip6 || die "nft IPv6 matched martian" + + # test 2: rpfilter match should pass for regular traffic + netns_ping "$ns1" 192.168.23.1 || \ die "regular ping 192.168.23.1 failed" + netns_ping "$ns1" fec0:23::1 || \ die "regular ping fec0:23::1 failed" + + ipt_zero_rule "$iptables" && die "iptables match not effective" + ipt_zero_rule "$ip6tables" && die "ip6tables match not effective" + nft_zero_rule ip && die "nft IPv4 match not effective" + nft_zero_rule ip6 && die "nft IPv6 match not effective" + +} + +testrun + +# repeat test with vrf device in $ns2 +ip -net "$ns2" link add vrf0 type vrf table 10 +ip -net "$ns2" link set vrf0 up +ip -net "$ns2" link set v0 master vrf0 + +testrun + +echo "PASS: netfilter reverse path match works as intended" +exit 0 diff --git a/tools/testing/selftests/powerpc/benchmarks/gettimeofday.c b/tools/testing/selftests/powerpc/benchmarks/gettimeofday.c index 6b415683357b..580fcac0a09f 100644 --- a/tools/testing/selftests/powerpc/benchmarks/gettimeofday.c +++ b/tools/testing/selftests/powerpc/benchmarks/gettimeofday.c @@ -12,7 +12,7 @@ static int test_gettimeofday(void) { int i; - struct timeval tv_start, tv_end; + struct timeval tv_start, tv_end, tv_diff; gettimeofday(&tv_start, NULL); @@ -20,7 +20,9 @@ static int test_gettimeofday(void) gettimeofday(&tv_end, NULL); } - printf("time = %.6f\n", tv_end.tv_sec - tv_start.tv_sec + (tv_end.tv_usec - tv_start.tv_usec) * 1e-6); + timersub(&tv_end, &tv_start, &tv_diff); + + printf("time = %.6f\n", tv_diff.tv_sec + (tv_diff.tv_usec) * 1e-6); return 0; }
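timersub(a, b, res) from <sys/time.h> computes res = a - b, so the end timestamp must be passed first to get a positive elapsed time; a minimal self-contained usage sketch:

        #include <stdio.h>
        #include <sys/time.h>
        #include <unistd.h>

        int main(void)
        {
                struct timeval start, end, diff;

                gettimeofday(&start, NULL);
                usleep(250000);                 /* stand-in for real work */
                gettimeofday(&end, NULL);

                /* diff = end - start; swapping the first two arguments
                 * would yield a negative interval. */
                timersub(&end, &start, &diff);
                printf("elapsed = %.6f s\n",
                       diff.tv_sec + diff.tv_usec * 1e-6);
                return 0;
        }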
diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile index 27dc09d0bfee..19dd0b2ea397 100644 --- a/tools/testing/selftests/powerpc/mm/Makefile +++ b/tools/testing/selftests/powerpc/mm/Makefile @@ -3,7 +3,7 @@ noarg: $(MAKE) -C ../ TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors wild_bctr \ - large_vm_fork_separation bad_accesses pkey_exec_prot \ + large_vm_fork_separation bad_accesses exec_prot pkey_exec_prot \ pkey_siginfo stack_expansion_signal stack_expansion_ldst \ large_vm_gpr_corruption TEST_PROGS := stress_code_patching.sh @@ -22,6 +22,7 @@ $(OUTPUT)/wild_bctr: CFLAGS += -m64 $(OUTPUT)/large_vm_fork_separation: CFLAGS += -m64 $(OUTPUT)/large_vm_gpr_corruption: CFLAGS += -m64 $(OUTPUT)/bad_accesses: CFLAGS += -m64 +$(OUTPUT)/exec_prot: CFLAGS += -m64 $(OUTPUT)/pkey_exec_prot: CFLAGS += -m64 $(OUTPUT)/pkey_siginfo: CFLAGS += -m64 diff --git a/tools/testing/selftests/powerpc/mm/exec_prot.c b/tools/testing/selftests/powerpc/mm/exec_prot.c new file mode 100644 index 000000000000..db75b2225de1 --- /dev/null +++ b/tools/testing/selftests/powerpc/mm/exec_prot.c @@ -0,0 
+1,231 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2022, Nicholas Miehlbradt, IBM Corporation + * based on pkey_exec_prot.c + * + * Test if applying execute protection on pages works as expected. + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <signal.h> + +#include <unistd.h> +#include <sys/mman.h> + +#include "pkeys.h" + + +#define PPC_INST_NOP 0x60000000 +#define PPC_INST_TRAP 0x7fe00008 +#define PPC_INST_BLR 0x4e800020 + +static volatile sig_atomic_t fault_code; +static volatile sig_atomic_t remaining_faults; +static volatile unsigned int *fault_addr; +static unsigned long pgsize, numinsns; +static unsigned int *insns; +static bool pkeys_supported; + +static bool is_fault_expected(int fault_code) +{ + if (fault_code == SEGV_ACCERR) + return true; + + /* Assume any pkey error is fine since pkey_exec_prot test covers them */ + if (fault_code == SEGV_PKUERR && pkeys_supported) + return true; + + return false; +} + +static void trap_handler(int signum, siginfo_t *sinfo, void *ctx) +{ + /* Check if this fault originated from the expected address */ + if (sinfo->si_addr != (void *)fault_addr) + sigsafe_err("got a fault for an unexpected address\n"); + + _exit(1); +} + +static void segv_handler(int signum, siginfo_t *sinfo, void *ctx) +{ + fault_code = sinfo->si_code; + + /* Check if this fault originated from the expected address */ + if (sinfo->si_addr != (void *)fault_addr) { + sigsafe_err("got a fault for an unexpected address\n"); + _exit(1); + } + + /* Check if too many faults have occurred for a single test case */ + if (!remaining_faults) { + sigsafe_err("got too many faults for the same address\n"); + _exit(1); + } + + + /* Restore permissions in order to continue */ + if (is_fault_expected(fault_code)) { + if (mprotect(insns, pgsize, PROT_READ | PROT_WRITE | PROT_EXEC)) { + sigsafe_err("failed to set access permissions\n"); + _exit(1); + } + } else { + sigsafe_err("got a fault with an unexpected code\n"); + _exit(1); + } + + remaining_faults--; +} + +static int check_exec_fault(int rights) +{ + /* + * Jump to the executable region. + * + * The first iteration also checks if the overwrite of the + * first instruction word from a trap to a no-op succeeded. 
+ */ + fault_code = -1; + remaining_faults = 0; + if (!(rights & PROT_EXEC)) + remaining_faults = 1; + + FAIL_IF(mprotect(insns, pgsize, rights) != 0); + asm volatile("mtctr %0; bctrl" : : "r"(insns)); + + FAIL_IF(remaining_faults != 0); + if (!(rights & PROT_EXEC)) + FAIL_IF(!is_fault_expected(fault_code)); + + return 0; +} + +static int test(void) +{ + struct sigaction segv_act, trap_act; + int i; + + /* Skip the test if the CPU doesn't support Radix */ + SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_00)); + + /* Check if pkeys are supported */ + pkeys_supported = pkeys_unsupported() == 0; + + /* Setup SIGSEGV handler */ + segv_act.sa_handler = 0; + segv_act.sa_sigaction = segv_handler; + FAIL_IF(sigprocmask(SIG_SETMASK, 0, &segv_act.sa_mask) != 0); + segv_act.sa_flags = SA_SIGINFO; + segv_act.sa_restorer = 0; + FAIL_IF(sigaction(SIGSEGV, &segv_act, NULL) != 0); + + /* Setup SIGTRAP handler */ + trap_act.sa_handler = 0; + trap_act.sa_sigaction = trap_handler; + FAIL_IF(sigprocmask(SIG_SETMASK, 0, &trap_act.sa_mask) != 0); + trap_act.sa_flags = SA_SIGINFO; + trap_act.sa_restorer = 0; + FAIL_IF(sigaction(SIGTRAP, &trap_act, NULL) != 0); + + /* Setup executable region */ + pgsize = getpagesize(); + numinsns = pgsize / sizeof(unsigned int); + insns = (unsigned int *)mmap(NULL, pgsize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + FAIL_IF(insns == MAP_FAILED); + + /* Write the instruction words */ + for (i = 1; i < numinsns - 1; i++) + insns[i] = PPC_INST_NOP; + + /* + * Set the first instruction as an unconditional trap. If + * the last write to this address succeeds, this should + * get overwritten by a no-op. + */ + insns[0] = PPC_INST_TRAP; + + /* + * Later, to jump to the executable region, we use a branch + * and link instruction (bctrl) which sets the return address + * automatically in LR. Use that to return back. + */ + insns[numinsns - 1] = PPC_INST_BLR; + + /* + * Pick the first instruction's address from the executable + * region. + */ + fault_addr = insns; + + /* + * Read an instruction word from the address when the page + * is execute only. This should generate an access fault. + */ + fault_code = -1; + remaining_faults = 1; + printf("Testing read on --x, should fault..."); + FAIL_IF(mprotect(insns, pgsize, PROT_EXEC) != 0); + i = *fault_addr; + FAIL_IF(remaining_faults != 0 || !is_fault_expected(fault_code)); + printf("ok!\n"); + + /* + * Write an instruction word to the address when the page + * is execute only. This should also generate an access fault. + */ + fault_code = -1; + remaining_faults = 1; + printf("Testing write on --x, should fault..."); + FAIL_IF(mprotect(insns, pgsize, PROT_EXEC) != 0); + *fault_addr = PPC_INST_NOP; + FAIL_IF(remaining_faults != 0 || !is_fault_expected(fault_code)); + printf("ok!\n"); + + printf("Testing exec on ---, should fault..."); + FAIL_IF(check_exec_fault(PROT_NONE)); + printf("ok!\n"); + + printf("Testing exec on r--, should fault..."); + FAIL_IF(check_exec_fault(PROT_READ)); + printf("ok!\n"); + + printf("Testing exec on -w-, should fault..."); + FAIL_IF(check_exec_fault(PROT_WRITE)); + printf("ok!\n"); + + printf("Testing exec on rw-, should fault..."); + FAIL_IF(check_exec_fault(PROT_READ | PROT_WRITE)); + printf("ok!\n"); + + printf("Testing exec on --x, should succeed..."); + FAIL_IF(check_exec_fault(PROT_EXEC)); + printf("ok!\n"); + + printf("Testing exec on r-x, should succeed..."); + FAIL_IF(check_exec_fault(PROT_READ | PROT_EXEC)); + printf("ok!\n"); + + printf("Testing exec on -wx, should succeed..."); + FAIL_IF(check_exec_fault(PROT_WRITE | PROT_EXEC)); + printf("ok!\n"); + + printf("Testing exec on rwx, should succeed..."); + FAIL_IF(check_exec_fault(PROT_READ | PROT_WRITE | PROT_EXEC)); + printf("ok!\n"); + + /* Cleanup */ + FAIL_IF(munmap((void *)insns, pgsize)); + + return 0; +} + +int main(void) +{ + return test_harness(test, "exec_prot"); +}
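The probe above is powerpc-specific (trap/nop/blr words plus the harness's FAIL_IF); as a condensed illustration of the same mprotect()/SIGSEGV dance, here is a hypothetical x86-64 variant (assumes an NX-capable CPU, so PROT_READ alone is not executable):

        #include <setjmp.h>
        #include <signal.h>
        #include <stdint.h>
        #include <stdio.h>
        #include <sys/mman.h>
        #include <unistd.h>

        static sigjmp_buf env;

        static void on_segv(int sig)
        {
                siglongjmp(env, 1);     /* sigsetjmp(.., 1) restores the mask */
        }

        /* Returns 1 if jumping into 'page' faults, 0 if it executes. */
        static int exec_faults(void *page)
        {
                void (*fn)(void) = (void (*)(void))(uintptr_t)page;

                if (sigsetjmp(env, 1))
                        return 1;       /* landed here from the handler */
                fn();
                return 0;
        }

        int main(void)
        {
                long pgsz = sysconf(_SC_PAGESIZE);
                unsigned char *page = mmap(NULL, pgsz, PROT_READ | PROT_WRITE,
                                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
                struct sigaction act = { .sa_handler = on_segv };

                sigaction(SIGSEGV, &act, NULL);
                page[0] = 0xc3;         /* x86-64 'ret' */

                mprotect(page, pgsz, PROT_READ);        /* r--: exec must fault */
                printf("r--: %s\n", exec_faults(page) ? "faults" : "executes");

                mprotect(page, pgsz, PROT_EXEC);        /* --x: exec must work */
                printf("--x: %s\n", exec_faults(page) ? "faults" : "executes");

                munmap(page, pgsz);
                return 0;
        }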
diff --git a/tools/testing/selftests/powerpc/mm/large_vm_gpr_corruption.c b/tools/testing/selftests/powerpc/mm/large_vm_gpr_corruption.c index 927bfae99ed9..7da515f1da72 100644 --- a/tools/testing/selftests/powerpc/mm/large_vm_gpr_corruption.c +++ b/tools/testing/selftests/powerpc/mm/large_vm_gpr_corruption.c @@ -112,6 +112,8 @@ static int test(void) // This tests a hash MMU specific bug. FAIL_IF(using_hash_mmu(&hash_mmu)); SKIP_IF(!hash_mmu); + // 4K kernels don't support 4PB address space + SKIP_IF(sysconf(_SC_PAGESIZE) < 65536); page_size = sysconf(_SC_PAGESIZE); diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/bhrb_filter_map_test.c b/tools/testing/selftests/powerpc/pmu/sampling_tests/bhrb_filter_map_test.c index 8182647c63c8..3f43c315c666 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/bhrb_filter_map_test.c +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/bhrb_filter_map_test.c @@ -96,6 +96,15 @@ static int bhrb_filter_map_test(void) } } + /* + * Combine filter maps which include a valid branch filter and an invalid branch + * filter. Example: any (PERF_SAMPLE_BRANCH_ANY) and any_call + * (PERF_SAMPLE_BRANCH_ANY_CALL). + * The perf_event_open() call should fail in this case. 
+ */ + event.attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY | PERF_SAMPLE_BRANCH_ANY_CALL; + FAIL_IF(!event_open(&event)); + return 0; } diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index c4e6a34f9657..a156ac5dd2c6 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -5,6 +5,7 @@ /proc-fsconfig-hidepid /proc-loadavg-001 /proc-multiple-procfs +/proc-empty-vm /proc-pid-vm /proc-self-map-files-001 /proc-self-map-files-002 diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index 219fc6113847..cd95369254c0 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -8,6 +8,7 @@ TEST_GEN_PROGS += fd-001-lookup TEST_GEN_PROGS += fd-002-posix-eq TEST_GEN_PROGS += fd-003-kthread TEST_GEN_PROGS += proc-loadavg-001 +TEST_GEN_PROGS += proc-empty-vm TEST_GEN_PROGS += proc-pid-vm TEST_GEN_PROGS += proc-self-map-files-001 TEST_GEN_PROGS += proc-self-map-files-002 diff --git a/tools/testing/selftests/proc/proc-empty-vm.c b/tools/testing/selftests/proc/proc-empty-vm.c new file mode 100644 index 000000000000..d95b1cb43d9d --- /dev/null +++ b/tools/testing/selftests/proc/proc-empty-vm.c @@ -0,0 +1,386 @@ +/* + * Copyright (c) 2022 Alexey Dobriyan <adobriyan@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* + * Create a process without mappings by unmapping everything at once and + * holding it with ptrace(2). 
See what happens to + * + * /proc/${pid}/maps + * /proc/${pid}/numa_maps + * /proc/${pid}/smaps + * /proc/${pid}/smaps_rollup + */ +#undef NDEBUG +#include <assert.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/ptrace.h> +#include <sys/resource.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> + +/* + * 0: vsyscall VMA doesn't exist vsyscall=none + * 1: vsyscall VMA is --xp vsyscall=xonly + * 2: vsyscall VMA is r-xp vsyscall=emulate + */ +static int g_vsyscall; +static const char *g_proc_pid_maps_vsyscall; +static const char *g_proc_pid_smaps_vsyscall; + +static const char proc_pid_maps_vsyscall_0[] = ""; +static const char proc_pid_maps_vsyscall_1[] = +"ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall]\n"; +static const char proc_pid_maps_vsyscall_2[] = +"ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"; + +static const char proc_pid_smaps_vsyscall_0[] = ""; + +static const char proc_pid_smaps_vsyscall_1[] = +"ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall]\n" +"Size: 4 kB\n" +"KernelPageSize: 4 kB\n" +"MMUPageSize: 4 kB\n" +"Rss: 0 kB\n" +"Pss: 0 kB\n" +"Pss_Dirty: 0 kB\n" +"Shared_Clean: 0 kB\n" +"Shared_Dirty: 0 kB\n" +"Private_Clean: 0 kB\n" +"Private_Dirty: 0 kB\n" +"Referenced: 0 kB\n" +"Anonymous: 0 kB\n" +"LazyFree: 0 kB\n" +"AnonHugePages: 0 kB\n" +"ShmemPmdMapped: 0 kB\n" +"FilePmdMapped: 0 kB\n" +"Shared_Hugetlb: 0 kB\n" +"Private_Hugetlb: 0 kB\n" +"Swap: 0 kB\n" +"SwapPss: 0 kB\n" +"Locked: 0 kB\n" +"THPeligible: 0\n" +/* + * "ProtectionKey:" field is conditional. It is possible to check it as well, + * but I don't have such a machine. + */ +; + +static const char proc_pid_smaps_vsyscall_2[] = +"ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n" +"Size: 4 kB\n" +"KernelPageSize: 4 kB\n" +"MMUPageSize: 4 kB\n" +"Rss: 0 kB\n" +"Pss: 0 kB\n" +"Pss_Dirty: 0 kB\n" +"Shared_Clean: 0 kB\n" +"Shared_Dirty: 0 kB\n" +"Private_Clean: 0 kB\n" +"Private_Dirty: 0 kB\n" +"Referenced: 0 kB\n" +"Anonymous: 0 kB\n" +"LazyFree: 0 kB\n" +"AnonHugePages: 0 kB\n" +"ShmemPmdMapped: 0 kB\n" +"FilePmdMapped: 0 kB\n" +"Shared_Hugetlb: 0 kB\n" +"Private_Hugetlb: 0 kB\n" +"Swap: 0 kB\n" +"SwapPss: 0 kB\n" +"Locked: 0 kB\n" +"THPeligible: 0\n" +/* + * "ProtectionKey:" field is conditional. It is possible to check it as well, + * but I'm too tired. + */ +; + +static void sigaction_SIGSEGV(int _, siginfo_t *__, void *___) +{ + _exit(EXIT_FAILURE); +} + +static void sigaction_SIGSEGV_vsyscall(int _, siginfo_t *__, void *___) +{ + _exit(g_vsyscall); +} + +/* + * vsyscall page can't be unmapped, probe it directly. + */ +static void vsyscall(void) +{ + pid_t pid; + int wstatus; + + pid = fork(); + if (pid < 0) { + fprintf(stderr, "fork, errno %d\n", errno); + exit(1); + } + if (pid == 0) { + setrlimit(RLIMIT_CORE, &(struct rlimit){}); + + /* Hide "segfault at ffffffffff600000" messages. 
*/ + struct sigaction act = {}; + act.sa_flags = SA_SIGINFO; + act.sa_sigaction = sigaction_SIGSEGV_vsyscall; + sigaction(SIGSEGV, &act, NULL); + + g_vsyscall = 0; + /* gettimeofday(NULL, NULL); */ + asm volatile ( + "call %P0" + : + : "i" (0xffffffffff600000), "D" (NULL), "S" (NULL) + : "rax", "rcx", "r11" + ); + + g_vsyscall = 1; + *(volatile int *)0xffffffffff600000UL; + + g_vsyscall = 2; + exit(g_vsyscall); + } + waitpid(pid, &wstatus, 0); + if (WIFEXITED(wstatus)) { + g_vsyscall = WEXITSTATUS(wstatus); + } else { + fprintf(stderr, "error: vsyscall wstatus %08x\n", wstatus); + exit(1); + } +} + +static int test_proc_pid_maps(pid_t pid) +{ + char buf[4096]; + snprintf(buf, sizeof(buf), "/proc/%u/maps", pid); + int fd = open(buf, O_RDONLY); + if (fd == -1) { + perror("open /proc/${pid}/maps"); + return EXIT_FAILURE; + } else { + ssize_t rv = read(fd, buf, sizeof(buf)); + close(fd); + if (g_vsyscall == 0) { + assert(rv == 0); + } else { + size_t len = strlen(g_proc_pid_maps_vsyscall); + assert(rv == len); + assert(memcmp(buf, g_proc_pid_maps_vsyscall, len) == 0); + } + return EXIT_SUCCESS; + } +} + +static int test_proc_pid_numa_maps(pid_t pid) +{ + char buf[4096]; + snprintf(buf, sizeof(buf), "/proc/%u/numa_maps", pid); + int fd = open(buf, O_RDONLY); + if (fd == -1) { + if (errno == ENOENT) { + /* + * /proc/${pid}/numa_maps is under CONFIG_NUMA, + * it doesn't necessarily exist. + */ + return EXIT_SUCCESS; + } + perror("open /proc/${pid}/numa_maps"); + return EXIT_FAILURE; + } else { + ssize_t rv = read(fd, buf, sizeof(buf)); + close(fd); + assert(rv == 0); + return EXIT_SUCCESS; + } +} + +static int test_proc_pid_smaps(pid_t pid) +{ + char buf[4096]; + snprintf(buf, sizeof(buf), "/proc/%u/smaps", pid); + int fd = open(buf, O_RDONLY); + if (fd == -1) { + if (errno == ENOENT) { + /* + * /proc/${pid}/smaps is under CONFIG_PROC_PAGE_MONITOR, + * it doesn't necessarily exist. + */ + return EXIT_SUCCESS; + } + perror("open /proc/${pid}/smaps"); + return EXIT_FAILURE; + } else { + ssize_t rv = read(fd, buf, sizeof(buf)); + close(fd); + if (g_vsyscall == 0) { + assert(rv == 0); + } else { + size_t len = strlen(g_proc_pid_maps_vsyscall); + /* TODO "ProtectionKey:" */ + assert(rv > len); + assert(memcmp(buf, g_proc_pid_maps_vsyscall, len) == 0); + } + return EXIT_SUCCESS; + } +} + +static const char g_smaps_rollup[] = +"00000000-00000000 ---p 00000000 00:00 0 [rollup]\n" +"Rss: 0 kB\n" +"Pss: 0 kB\n" +"Pss_Dirty: 0 kB\n" +"Pss_Anon: 0 kB\n" +"Pss_File: 0 kB\n" +"Pss_Shmem: 0 kB\n" +"Shared_Clean: 0 kB\n" +"Shared_Dirty: 0 kB\n" +"Private_Clean: 0 kB\n" +"Private_Dirty: 0 kB\n" +"Referenced: 0 kB\n" +"Anonymous: 0 kB\n" +"LazyFree: 0 kB\n" +"AnonHugePages: 0 kB\n" +"ShmemPmdMapped: 0 kB\n" +"FilePmdMapped: 0 kB\n" +"Shared_Hugetlb: 0 kB\n" +"Private_Hugetlb: 0 kB\n" +"Swap: 0 kB\n" +"SwapPss: 0 kB\n" +"Locked: 0 kB\n" +; + +static int test_proc_pid_smaps_rollup(pid_t pid) +{ + char buf[4096]; + snprintf(buf, sizeof(buf), "/proc/%u/smaps_rollup", pid); + int fd = open(buf, O_RDONLY); + if (fd == -1) { + if (errno == ENOENT) { + /* + * /proc/${pid}/smaps_rollup is under CONFIG_PROC_PAGE_MONITOR, + * it doesn't necessarily exist. 
+ */ + return EXIT_SUCCESS; + } + perror("open /proc/${pid}/smaps_rollup"); + return EXIT_FAILURE; + } else { + ssize_t rv = read(fd, buf, sizeof(buf)); + close(fd); + assert(rv == sizeof(g_smaps_rollup) - 1); + assert(memcmp(buf, g_smaps_rollup, sizeof(g_smaps_rollup) - 1) == 0); + return EXIT_SUCCESS; + } +} + +int main(void) +{ + int rv = EXIT_SUCCESS; + + vsyscall(); + + switch (g_vsyscall) { + case 0: + g_proc_pid_maps_vsyscall = proc_pid_maps_vsyscall_0; + g_proc_pid_smaps_vsyscall = proc_pid_smaps_vsyscall_0; + break; + case 1: + g_proc_pid_maps_vsyscall = proc_pid_maps_vsyscall_1; + g_proc_pid_smaps_vsyscall = proc_pid_smaps_vsyscall_1; + break; + case 2: + g_proc_pid_maps_vsyscall = proc_pid_maps_vsyscall_2; + g_proc_pid_smaps_vsyscall = proc_pid_smaps_vsyscall_2; + break; + default: + abort(); + } + + pid_t pid = fork(); + if (pid == -1) { + perror("fork"); + return EXIT_FAILURE; + } else if (pid == 0) { + rv = ptrace(PTRACE_TRACEME, 0, NULL, NULL); + if (rv != 0) { + if (errno == EPERM) { + fprintf(stderr, +"Did you know? ptrace(PTRACE_TRACEME) doesn't work under strace.\n" + ); + kill(getppid(), SIGTERM); + return EXIT_FAILURE; + } + perror("ptrace PTRACE_TRACEME"); + return EXIT_FAILURE; + } + + /* + * Hide "segfault at ..." messages. Signal handler won't run. + */ + struct sigaction act = {}; + act.sa_flags = SA_SIGINFO; + act.sa_sigaction = sigaction_SIGSEGV; + sigaction(SIGSEGV, &act, NULL); + +#ifdef __amd64__ + munmap(NULL, ((size_t)1 << 47) - 4096); +#else +#error "implement 'unmap everything'" +#endif + return EXIT_FAILURE; + } else { + /* + * TODO find reliable way to signal parent that munmap(2) completed. + * Child can't do it directly because it effectively doesn't exist + * anymore. Looking at child's VM files isn't 100% reliable either: + * due to a bug they may not become empty or empty-like. + */ + sleep(1); + + if (rv == EXIT_SUCCESS) { + rv = test_proc_pid_maps(pid); + } + if (rv == EXIT_SUCCESS) { + rv = test_proc_pid_numa_maps(pid); + } + if (rv == EXIT_SUCCESS) { + rv = test_proc_pid_smaps(pid); + } + if (rv == EXIT_SUCCESS) { + rv = test_proc_pid_smaps_rollup(pid); + } + /* + * TODO test /proc/${pid}/statm, task_statm() + * ->start_code, ->end_code aren't updated by munmap(). + * Output can be "0 0 0 2 0 0 0\n" where "2" can be anything. + */ + + /* Cut the rope. 
*/ + int wstatus; + waitpid(pid, &wstatus, 0); + assert(WIFSTOPPED(wstatus)); + assert(WSTOPSIG(wstatus) == SIGSEGV); + } + + return rv; +} diff --git a/tools/testing/selftests/proc/proc-pid-vm.c b/tools/testing/selftests/proc/proc-pid-vm.c index e5962f4794f5..69551bfa215c 100644 --- a/tools/testing/selftests/proc/proc-pid-vm.c +++ b/tools/testing/selftests/proc/proc-pid-vm.c @@ -213,22 +213,22 @@ static int make_exe(const uint8_t *payload, size_t len) /* * 0: vsyscall VMA doesn't exist vsyscall=none - * 1: vsyscall VMA is r-xp vsyscall=emulate - * 2: vsyscall VMA is --xp vsyscall=xonly + * 1: vsyscall VMA is --xp vsyscall=xonly + * 2: vsyscall VMA is r-xp vsyscall=emulate */ -static int g_vsyscall; +static volatile int g_vsyscall; static const char *str_vsyscall; static const char str_vsyscall_0[] = ""; static const char str_vsyscall_1[] = -"ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"; -static const char str_vsyscall_2[] = "ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall]\n"; +static const char str_vsyscall_2[] = +"ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"; #ifdef __x86_64__ static void sigaction_SIGSEGV(int _, siginfo_t *__, void *___) { - _exit(1); + _exit(g_vsyscall); } /* @@ -255,6 +255,7 @@ static void vsyscall(void) act.sa_sigaction = sigaction_SIGSEGV; (void)sigaction(SIGSEGV, &act, NULL); + g_vsyscall = 0; /* gettimeofday(NULL, NULL); */ asm volatile ( "call %P0" @@ -262,45 +263,20 @@ static void vsyscall(void) : "i" (0xffffffffff600000), "D" (NULL), "S" (NULL) : "rax", "rcx", "r11" ); - exit(0); - } - waitpid(pid, &wstatus, 0); - if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) { - /* vsyscall page exists and is executable. */ - } else { - /* vsyscall page doesn't exist. */ - g_vsyscall = 0; - return; - } - - pid = fork(); - if (pid < 0) { - fprintf(stderr, "fork, errno %d\n", errno); - exit(1); - } - if (pid == 0) { - struct rlimit rlim = {0, 0}; - (void)setrlimit(RLIMIT_CORE, &rlim); - - /* Hide "segfault at ffffffffff600000" messages. */ - struct sigaction act; - memset(&act, 0, sizeof(struct sigaction)); - act.sa_flags = SA_SIGINFO; - act.sa_sigaction = sigaction_SIGSEGV; - (void)sigaction(SIGSEGV, &act, NULL); + g_vsyscall = 1; *(volatile int *)0xffffffffff600000UL; - exit(0); + + g_vsyscall = 2; + exit(g_vsyscall); } waitpid(pid, &wstatus, 0); - if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) { - /* vsyscall page is readable and executable. */ - g_vsyscall = 1; - return; + if (WIFEXITED(wstatus)) { + g_vsyscall = WEXITSTATUS(wstatus); + } else { + fprintf(stderr, "error: wstatus %08x\n", wstatus); + exit(1); } - - /* vsyscall page is executable but unreadable. 
*/ - g_vsyscall = 2; } int main(void) diff --git a/tools/testing/selftests/tpm2/tpm2.py b/tools/testing/selftests/tpm2/tpm2.py index 057a4f49c79d..c7363c6764fc 100644 --- a/tools/testing/selftests/tpm2/tpm2.py +++ b/tools/testing/selftests/tpm2/tpm2.py @@ -371,6 +371,10 @@ class Client: fcntl.fcntl(self.tpm, fcntl.F_SETFL, flags) self.tpm_poll = select.poll() + def __del__(self): + if self.tpm: + self.tpm.close() + def close(self): self.tpm.close() diff --git a/tools/testing/selftests/user_events/ftrace_test.c b/tools/testing/selftests/user_events/ftrace_test.c index a80fb5ef61d5..404a2713dcae 100644 --- a/tools/testing/selftests/user_events/ftrace_test.c +++ b/tools/testing/selftests/user_events/ftrace_test.c @@ -22,6 +22,11 @@ const char *enable_file = "/sys/kernel/debug/tracing/events/user_events/__test_e const char *trace_file = "/sys/kernel/debug/tracing/trace"; const char *fmt_file = "/sys/kernel/debug/tracing/events/user_events/__test_event/format"; +static inline int status_check(char *status_page, int status_bit) +{ + return status_page[status_bit >> 3] & (1 << (status_bit & 7)); +} + static int trace_bytes(void) { int fd = open(trace_file, O_RDONLY); @@ -197,12 +202,12 @@ TEST_F(user, register_events) { /* Register should work */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); + ASSERT_NE(0, reg.status_bit); /* Multiple registers should result in same index */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); + ASSERT_NE(0, reg.status_bit); /* Ensure disabled */ self->enable_fd = open(enable_file, O_RDWR); @@ -212,15 +217,15 @@ TEST_F(user, register_events) { /* MMAP should work and be zero'd */ ASSERT_NE(MAP_FAILED, status_page); ASSERT_NE(NULL, status_page); - ASSERT_EQ(0, status_page[reg.status_index]); + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); /* Enable event and ensure bits updated in status */ ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1"))) - ASSERT_EQ(EVENT_STATUS_FTRACE, status_page[reg.status_index]); + ASSERT_NE(0, status_check(status_page, reg.status_bit)); /* Disable event and ensure bits updated in status */ ASSERT_NE(-1, write(self->enable_fd, "0", sizeof("0"))) - ASSERT_EQ(0, status_page[reg.status_index]); + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); /* File still open should return -EBUSY for delete */ ASSERT_EQ(-1, ioctl(self->data_fd, DIAG_IOCSDEL, "__test_event")); @@ -240,6 +245,8 @@ TEST_F(user, write_events) { struct iovec io[3]; __u32 field1, field2; int before = 0, after = 0; + int page_size = sysconf(_SC_PAGESIZE); + char *status_page; reg.size = sizeof(reg); reg.name_args = (__u64)"__test_event u32 field1; u32 field2"; @@ -254,10 +261,18 @@ TEST_F(user, write_events) { io[2].iov_base = &field2; io[2].iov_len = sizeof(field2); + status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED, + self->status_fd, 0); + /* Register should work */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); + ASSERT_NE(0, reg.status_bit); + + /* MMAP should work and be zero'd */ + ASSERT_NE(MAP_FAILED, status_page); + ASSERT_NE(NULL, status_page); + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); /* Write should fail on invalid slot with ENOENT */ io[0].iov_base = &field2; @@ -271,6 +286,9 @@ TEST_F(user, write_events) { self->enable_fd = open(enable_file, O_RDWR); ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1"))) + /* 
Event should now be enabled */ + ASSERT_NE(0, status_check(status_page, reg.status_bit)); + /* Write should make it out to ftrace buffers */ before = trace_bytes(); ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 3)); @@ -298,7 +316,7 @@ TEST_F(user, write_fault) { /* Register should work */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); + ASSERT_NE(0, reg.status_bit); /* Write should work normally */ ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 2)); @@ -315,6 +333,11 @@ TEST_F(user, write_validator) { int loc, bytes; char data[8]; int before = 0, after = 0; + int page_size = sysconf(_SC_PAGESIZE); + char *status_page; + + status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED, + self->status_fd, 0); reg.size = sizeof(reg); reg.name_args = (__u64)"__test_event __rel_loc char[] data"; @@ -322,7 +345,12 @@ TEST_F(user, write_validator) { /* Register should work */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); + ASSERT_NE(0, reg.status_bit); + + /* MMAP should work and be zero'd */ + ASSERT_NE(MAP_FAILED, status_page); + ASSERT_NE(NULL, status_page); + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); io[0].iov_base = ®.write_index; io[0].iov_len = sizeof(reg.write_index); @@ -340,6 +368,9 @@ TEST_F(user, write_validator) { self->enable_fd = open(enable_file, O_RDWR); ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1"))) + /* Event should now be enabled */ + ASSERT_NE(0, status_check(status_page, reg.status_bit)); + /* Full in-bounds write should work */ before = trace_bytes(); loc = DYN_LOC(0, bytes); diff --git a/tools/testing/selftests/user_events/perf_test.c b/tools/testing/selftests/user_events/perf_test.c index 26851d51d6bb..8b4c7879d5a7 100644 --- a/tools/testing/selftests/user_events/perf_test.c +++ b/tools/testing/selftests/user_events/perf_test.c @@ -35,6 +35,11 @@ static long perf_event_open(struct perf_event_attr *pe, pid_t pid, return syscall(__NR_perf_event_open, pe, pid, cpu, group_fd, flags); } +static inline int status_check(char *status_page, int status_bit) +{ + return status_page[status_bit >> 3] & (1 << (status_bit & 7)); +} + static int get_id(void) { FILE *fp = fopen(id_file, "r"); @@ -120,8 +125,8 @@ TEST_F(user, perf_write) { /* Register should work */ ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); - ASSERT_NE(0, reg.status_index); - ASSERT_EQ(0, status_page[reg.status_index]); + ASSERT_NE(0, reg.status_bit); + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); /* Id should be there */ id = get_id(); @@ -144,7 +149,7 @@ TEST_F(user, perf_write) { ASSERT_NE(MAP_FAILED, perf_page); /* Status should be updated */ - ASSERT_EQ(EVENT_STATUS_PERF, status_page[reg.status_index]); + ASSERT_NE(0, status_check(status_page, reg.status_bit)); event.index = reg.write_index; event.field1 = 0xc001; diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 31e5eea2a9b9..7b9dc2426f18 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -30,7 +30,6 @@ map_fixed_noreplace write_to_hugetlbfs hmm-tests memfd_secret -local_config.* soft-dirty split_huge_page_test ksm_tests diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index d516b8c38eed..163c2fde3cb3 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ 
-1,9 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for vm selftests -LOCAL_HDRS += $(selfdir)/vm/local_config.h $(top_srcdir)/mm/gup_test.h - -include local_config.mk +LOCAL_HDRS += $(top_srcdir)/mm/gup_test.h uname_M := $(shell uname -m 2>/dev/null || echo not) MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/') @@ -97,9 +95,11 @@ TEST_FILES += va_128TBswitch.sh include ../lib.mk +$(OUTPUT)/khugepaged: vm_util.c $(OUTPUT)/madv_populate: vm_util.c $(OUTPUT)/soft-dirty: vm_util.c $(OUTPUT)/split_huge_page_test: vm_util.c +$(OUTPUT)/userfaultfd: vm_util.c ifeq ($(MACHINE),x86_64) BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) @@ -152,23 +152,6 @@ endif $(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap -# HMM_EXTRA_LIBS may get set in local_config.mk, or it may be left empty. -$(OUTPUT)/hmm-tests: LDLIBS += $(HMM_EXTRA_LIBS) - $(OUTPUT)/ksm_tests: LDLIBS += -lnuma $(OUTPUT)/migration: LDLIBS += -lnuma - -local_config.mk local_config.h: check_config.sh - /bin/sh ./check_config.sh $(CC) - -EXTRA_CLEAN += local_config.mk local_config.h - -ifeq ($(HMM_EXTRA_LIBS),) -all: warn_missing_hugelibs - -warn_missing_hugelibs: - @echo ; \ - echo "Warning: missing libhugetlbfs support. Some HMM tests will be skipped." ; \ - echo -endif diff --git a/tools/testing/selftests/vm/check_config.sh b/tools/testing/selftests/vm/check_config.sh deleted file mode 100644 index 079c8a40b85d..000000000000 --- a/tools/testing/selftests/vm/check_config.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# -# Probe for libraries and create header files to record the results. Both C -# header files and Makefile include fragments are created. - -OUTPUT_H_FILE=local_config.h -OUTPUT_MKFILE=local_config.mk - -# libhugetlbfs -tmpname=$(mktemp) -tmpfile_c=${tmpname}.c -tmpfile_o=${tmpname}.o - -echo "#include <sys/types.h>" > $tmpfile_c -echo "#include <hugetlbfs.h>" >> $tmpfile_c -echo "int func(void) { return 0; }" >> $tmpfile_c - -CC=${1:?"Usage: $0 <compiler> # example compiler: gcc"} -$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1 - -if [ -f $tmpfile_o ]; then - echo "#define LOCAL_CONFIG_HAVE_LIBHUGETLBFS 1" > $OUTPUT_H_FILE - echo "HMM_EXTRA_LIBS = -lhugetlbfs" > $OUTPUT_MKFILE -else - echo "// No libhugetlbfs support found" > $OUTPUT_H_FILE - echo "# No libhugetlbfs support found, so:" > $OUTPUT_MKFILE - echo "HMM_EXTRA_LIBS = " >> $OUTPUT_MKFILE -fi - -rm ${tmpname}.* diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c index 98b949c279be..4adaad1b822f 100644 --- a/tools/testing/selftests/vm/hmm-tests.c +++ b/tools/testing/selftests/vm/hmm-tests.c @@ -26,10 +26,6 @@ #include <sys/mman.h> #include <sys/ioctl.h> -#include "./local_config.h" -#ifdef LOCAL_CONFIG_HAVE_LIBHUGETLBFS -#include <hugetlbfs.h> -#endif /* * This is a private UAPI to the kernel test module so it isn't exported @@ -733,7 +729,54 @@ TEST_F(hmm, anon_write_huge) hmm_buffer_free(buffer); } -#ifdef LOCAL_CONFIG_HAVE_LIBHUGETLBFS +/* + * Read numeric data from raw and tagged kernel status files. Used to read + * /proc and /sys data (without a tag) and from /proc/meminfo (with a tag). 
+ */ +static long file_read_ulong(char *file, const char *tag) +{ + int fd; + char buf[2048]; + int len; + char *p, *q; + long val; + + fd = open(file, O_RDONLY); + if (fd < 0) { + /* Error opening the file */ + return -1; + } + + len = read(fd, buf, sizeof(buf)); + close(fd); + if (len < 0) { + /* Error in reading the file */ + return -1; + } + if (len == sizeof(buf)) { + /* Error file is too large */ + return -1; + } + buf[len] = '\0'; + + /* Search for a tag if provided */ + if (tag) { + p = strstr(buf, tag); + if (!p) + return -1; /* looks like the line we want isn't there */ + p += strlen(tag); + } else + p = buf; + + val = strtol(p, &q, 0); + if (*q != ' ') { + /* Error parsing the file */ + return -1; + } + + return val; +} + /* * Write huge TLBFS page. */ @@ -742,29 +785,27 @@ TEST_F(hmm, anon_write_hugetlbfs) struct hmm_buffer *buffer; unsigned long npages; unsigned long size; + unsigned long default_hsize; unsigned long i; int *ptr; int ret; - long pagesizes[4]; - int n, idx; - /* Skip test if we can't allocate a hugetlbfs page. */ - - n = gethugepagesizes(pagesizes, 4); - if (n <= 0) + default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:"); + if (default_hsize < 0 || default_hsize*1024 < default_hsize) SKIP(return, "Huge page size could not be determined"); - for (idx = 0; --n > 0; ) { - if (pagesizes[n] < pagesizes[idx]) - idx = n; - } - size = ALIGN(TWOMEG, pagesizes[idx]); + default_hsize = default_hsize*1024; /* KB to B */ + + size = ALIGN(TWOMEG, default_hsize); npages = size >> self->page_shift; buffer = malloc(sizeof(*buffer)); ASSERT_NE(buffer, NULL); - buffer->ptr = get_hugepage_region(size, GHR_STRICT); - if (buffer->ptr == NULL) { + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (buffer->ptr == MAP_FAILED) { free(buffer); SKIP(return, "Huge page could not be allocated"); } @@ -788,11 +829,10 @@ TEST_F(hmm, anon_write_hugetlbfs) for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) ASSERT_EQ(ptr[i], i); - free_hugepage_region(buffer->ptr); + munmap(buffer->ptr, buffer->size); buffer->ptr = NULL; hmm_buffer_free(buffer); } -#endif /* LOCAL_CONFIG_HAVE_LIBHUGETLBFS */ /* * Read mmap'ed file memory. @@ -1014,6 +1054,55 @@ TEST_F(hmm, migrate_fault) hmm_buffer_free(buffer); } +TEST_F(hmm, migrate_release) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Release device memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_RELEASE, buffer, npages); + ASSERT_EQ(ret, 0); + + /* Fault pages back to system memory and check them. 
*/ + for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + /* * Migrate anonymous shared memory to device private memory. */ @@ -1467,7 +1556,6 @@ TEST_F(hmm2, snapshot) hmm_buffer_free(buffer); } -#ifdef LOCAL_CONFIG_HAVE_LIBHUGETLBFS /* * Test the hmm_range_fault() HMM_PFN_PMD flag for large pages that * should be mapped by a large page table entry. @@ -1477,30 +1565,30 @@ TEST_F(hmm, compound) struct hmm_buffer *buffer; unsigned long npages; unsigned long size; + unsigned long default_hsize; int *ptr; unsigned char *m; int ret; - long pagesizes[4]; - int n, idx; unsigned long i; /* Skip test if we can't allocate a hugetlbfs page. */ - n = gethugepagesizes(pagesizes, 4); - if (n <= 0) - return; - for (idx = 0; --n > 0; ) { - if (pagesizes[n] < pagesizes[idx]) - idx = n; - } - size = ALIGN(TWOMEG, pagesizes[idx]); + default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:"); + if (default_hsize < 0 || default_hsize*1024 < default_hsize) + SKIP(return, "Huge page size could not be determined"); + default_hsize = default_hsize*1024; /* KB to B */ + + size = ALIGN(TWOMEG, default_hsize); npages = size >> self->page_shift; buffer = malloc(sizeof(*buffer)); ASSERT_NE(buffer, NULL); - buffer->ptr = get_hugepage_region(size, GHR_STRICT); - if (buffer->ptr == NULL) { + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (buffer->ptr == MAP_FAILED) { free(buffer); return; } @@ -1539,11 +1627,10 @@ TEST_F(hmm, compound) ASSERT_EQ(m[i], HMM_DMIRROR_PROT_READ | HMM_DMIRROR_PROT_PMD); - free_hugepage_region(buffer->ptr); + munmap(buffer->ptr, buffer->size); buffer->ptr = NULL; hmm_buffer_free(buffer); } -#endif /* LOCAL_CONFIG_HAVE_LIBHUGETLBFS */ /* * Test two devices reading the same memory (double mapped). 
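For reference: the hmm-tests.c hunks above drop the libhugetlbfs helpers (gethugepagesizes()/get_hugepage_region()) in favor of parsing the default huge page size out of /proc/meminfo and backing the buffer with mmap(MAP_HUGETLB). A minimal standalone sketch of that technique — illustrative only, not part of the patch, and assuming huge pages have already been reserved (e.g. echo 64 > /proc/sys/vm/nr_hugepages):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

/* Scan /proc/meminfo for the "Hugepagesize:" line; return bytes or -1. */
static long default_hugepage_bytes(void)
{
	FILE *fp = fopen("/proc/meminfo", "r");
	char buf[256];
	long kb = -1;

	if (!fp)
		return -1;
	while (fgets(buf, sizeof(buf), fp)) {
		/* Line looks like: "Hugepagesize:       2048 kB" */
		if (sscanf(buf, "Hugepagesize: %ld kB", &kb) == 1)
			break;
	}
	fclose(fp);
	return kb > 0 ? kb * 1024 : -1;	/* kB to bytes */
}

int main(void)
{
	long hsize = default_hugepage_bytes();
	char *p;

	if (hsize < 0) {
		fprintf(stderr, "could not determine huge page size\n");
		return 1;
	}
	/* Length must be a multiple of the huge page size for MAP_HUGETLB. */
	p = mmap(NULL, hsize, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (p == MAP_FAILED) {
		/* Typically means no huge pages are reserved. */
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}
	p[0] = 1;	/* touch the mapping to fault in a huge page */
	munmap(p, hsize);
	return 0;
}

This mirrors what the patch's file_read_ulong("/proc/meminfo", "Hugepagesize:") call plus the MAP_HUGETLB mmap() do, with no library dependency; where the mmap() fails, the selftests SKIP rather than fail, since missing reserved huge pages is an environment issue rather than a kernel bug.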
diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index 155120b67a16..64126c8cd561 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -1,6 +1,9 @@ #define _GNU_SOURCE +#include <ctype.h> +#include <errno.h> #include <fcntl.h> #include <limits.h> +#include <dirent.h> #include <signal.h> #include <stdio.h> #include <stdlib.h> @@ -10,10 +13,24 @@ #include <sys/mman.h> #include <sys/wait.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/sysmacros.h> +#include <sys/vfs.h> + +#include "linux/magic.h" + +#include "vm_util.h" #ifndef MADV_PAGEOUT #define MADV_PAGEOUT 21 #endif +#ifndef MADV_POPULATE_READ +#define MADV_POPULATE_READ 22 +#endif +#ifndef MADV_COLLAPSE +#define MADV_COLLAPSE 25 +#endif #define BASE_ADDR ((void *)(1UL << 30)) static unsigned long hpage_pmd_size; @@ -22,6 +39,47 @@ static int hpage_pmd_nr; #define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" #define PID_SMAPS "/proc/self/smaps" +#define TEST_FILE "collapse_test_file" + +#define MAX_LINE_LENGTH 500 + +enum vma_type { + VMA_ANON, + VMA_FILE, + VMA_SHMEM, +}; + +struct mem_ops { + void *(*setup_area)(int nr_hpages); + void (*cleanup_area)(void *p, unsigned long size); + void (*fault)(void *p, unsigned long start, unsigned long end); + bool (*check_huge)(void *addr, int nr_hpages); + const char *name; +}; + +static struct mem_ops *file_ops; +static struct mem_ops *anon_ops; +static struct mem_ops *shmem_ops; + +struct collapse_context { + void (*collapse)(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect); + bool enforce_pte_scan_limits; + const char *name; +}; + +static struct collapse_context *khugepaged_context; +static struct collapse_context *madvise_context; + +struct file_info { + const char *dir; + char path[PATH_MAX]; + enum vma_type type; + int fd; + char dev_queue_read_ahead_path[PATH_MAX]; +}; + +static struct file_info finfo; enum thp_enabled { THP_ALWAYS, @@ -88,18 +146,7 @@ struct settings { enum shmem_enabled shmem_enabled; bool use_zero_page; struct khugepaged_settings khugepaged; -}; - -static struct settings default_settings = { - .thp_enabled = THP_MADVISE, - .thp_defrag = THP_DEFRAG_ALWAYS, - .shmem_enabled = SHMEM_NEVER, - .use_zero_page = 0, - .khugepaged = { - .defrag = 1, - .alloc_sleep_millisecs = 10, - .scan_sleep_millisecs = 10, - }, + unsigned long read_ahead_kb; }; static struct settings saved_settings; @@ -118,6 +165,11 @@ static void fail(const char *msg) exit_status++; } +static void skip(const char *msg) +{ + printf(" \e[33m%s\e[0m\n", msg); +} + static int read_file(const char *path, char *buf, size_t buflen) { int fd; @@ -145,13 +197,19 @@ static int write_file(const char *path, const char *buf, size_t buflen) ssize_t numwritten; fd = open(path, O_WRONLY); - if (fd == -1) + if (fd == -1) { + printf("open(%s)\n", path); + exit(EXIT_FAILURE); return 0; + } numwritten = write(fd, buf, buflen - 1); close(fd); - if (numwritten < 1) + if (numwritten < 1) { + printf("write(%s)\n", buf); + exit(EXIT_FAILURE); return 0; + } return (unsigned int) numwritten; } @@ -218,20 +276,11 @@ static void write_string(const char *name, const char *val) } } -static const unsigned long read_num(const char *name) +static const unsigned long _read_num(const char *path) { - char path[PATH_MAX]; char buf[21]; - int ret; - - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - 
ret = read_file(path, buf, sizeof(buf)); - if (ret < 0) { + if (read_file(path, buf, sizeof(buf)) < 0) { perror("read_file(read_num)"); exit(EXIT_FAILURE); } @@ -239,10 +288,9 @@ static const unsigned long read_num(const char *name) return strtoul(buf, NULL, 10); } -static void write_num(const char *name, unsigned long num) +static const unsigned long read_num(const char *name) { char path[PATH_MAX]; - char buf[21]; int ret; ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); @@ -250,6 +298,12 @@ static void write_num(const char *name, unsigned long num) printf("%s: Pathname is too long\n", __func__); exit(EXIT_FAILURE); } + return _read_num(path); +} + +static void _write_num(const char *path, unsigned long num) +{ + char buf[21]; sprintf(buf, "%ld", num); if (!write_file(path, buf, strlen(buf) + 1)) { @@ -258,6 +312,19 @@ static void write_num(const char *name, unsigned long num) } } +static void write_num(const char *name, unsigned long num) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + _write_num(path, num); +} + static void write_settings(struct settings *settings) { struct khugepaged_settings *khugepaged = &settings->khugepaged; @@ -277,6 +344,43 @@ static void write_settings(struct settings *settings) write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap); write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared); write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); + + if (file_ops && finfo.type == VMA_FILE) + _write_num(finfo.dev_queue_read_ahead_path, + settings->read_ahead_kb); +} + +#define MAX_SETTINGS_DEPTH 4 +static struct settings settings_stack[MAX_SETTINGS_DEPTH]; +static int settings_index; + +static struct settings *current_settings(void) +{ + if (!settings_index) { + printf("Fail: No settings set"); + exit(EXIT_FAILURE); + } + return settings_stack + settings_index - 1; +} + +static void push_settings(struct settings *settings) +{ + if (settings_index >= MAX_SETTINGS_DEPTH) { + printf("Fail: Settings stack exceeded"); + exit(EXIT_FAILURE); + } + settings_stack[settings_index++] = *settings; + write_settings(current_settings()); +} + +static void pop_settings(void) +{ + if (settings_index <= 0) { + printf("Fail: Settings stack empty"); + exit(EXIT_FAILURE); + } + --settings_index; + write_settings(current_settings()); } static void restore_settings(int sig) @@ -314,6 +418,10 @@ static void save_settings(void) .max_ptes_shared = read_num("khugepaged/max_ptes_shared"), .pages_to_scan = read_num("khugepaged/pages_to_scan"), }; + if (file_ops && finfo.type == VMA_FILE) + saved_settings.read_ahead_kb = + _read_num(finfo.dev_queue_read_ahead_path); + success("OK"); signal(SIGTERM, restore_settings); @@ -322,72 +430,90 @@ static void save_settings(void) signal(SIGQUIT, restore_settings); } -static void adjust_settings(void) +static void get_finfo(const char *dir) { + struct stat path_stat; + struct statfs fs; + char buf[1 << 10]; + char path[PATH_MAX]; + char *str, *end; - printf("Adjust settings..."); - write_settings(&default_settings); - success("OK"); -} - -#define MAX_LINE_LENGTH 500 - -static bool check_for_pattern(FILE *fp, char *pattern, char *buf) -{ - while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) { - if (!strncmp(buf, pattern, strlen(pattern))) - return true; + finfo.dir = dir; + stat(finfo.dir, &path_stat); + if (!S_ISDIR(path_stat.st_mode)) { + printf("%s: Not a directory 
(%s)\n", __func__, finfo.dir); + exit(EXIT_FAILURE); } - return false; -} - -static bool check_huge(void *addr) -{ - bool thp = false; - int ret; - FILE *fp; - char buffer[MAX_LINE_LENGTH]; - char addr_pattern[MAX_LINE_LENGTH]; - - ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", - (unsigned long) addr); - if (ret >= MAX_LINE_LENGTH) { - printf("%s: Pattern is too long\n", __func__); + if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE, + finfo.dir) >= sizeof(finfo.path)) { + printf("%s: Pathname is too long\n", __func__); exit(EXIT_FAILURE); } - - - fp = fopen(PID_SMAPS, "r"); - if (!fp) { - printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); + if (statfs(finfo.dir, &fs)) { + perror("statfs()"); exit(EXIT_FAILURE); } - if (!check_for_pattern(fp, addr_pattern, buffer)) - goto err_out; - - ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "AnonHugePages:%10ld kB", - hpage_pmd_size >> 10); - if (ret >= MAX_LINE_LENGTH) { - printf("%s: Pattern is too long\n", __func__); + finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE; + if (finfo.type == VMA_SHMEM) + return; + + /* Find owning device's queue/read_ahead_kb control */ + if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent", + major(path_stat.st_dev), minor(path_stat.st_dev)) + >= sizeof(path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + if (read_file(path, buf, sizeof(buf)) < 0) { + perror("read_file(read_num)"); + exit(EXIT_FAILURE); + } + if (strstr(buf, "DEVTYPE=disk")) { + /* Found it */ + if (snprintf(finfo.dev_queue_read_ahead_path, + sizeof(finfo.dev_queue_read_ahead_path), + "/sys/dev/block/%d:%d/queue/read_ahead_kb", + major(path_stat.st_dev), minor(path_stat.st_dev)) + >= sizeof(finfo.dev_queue_read_ahead_path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return; + } + if (!strstr(buf, "DEVTYPE=partition")) { + printf("%s: Unknown device type: %s\n", __func__, path); exit(EXIT_FAILURE); } /* - * Fetch the AnonHugePages: in the same block and check whether it got - * the expected number of hugeepages next. + * Partition of block device - need to find actual device. + * Using naming convention that devnameN is partition of + * device devname. 
*/ - if (!check_for_pattern(fp, "AnonHugePages:", buffer)) - goto err_out; - - if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) - goto err_out; - - thp = true; -err_out: - fclose(fp); - return thp; + str = strstr(buf, "DEVNAME="); + if (!str) { + printf("%s: Could not read: %s", __func__, path); + exit(EXIT_FAILURE); + } + str += 8; + end = str; + while (*end) { + if (isdigit(*end)) { + *end = '\0'; + if (snprintf(finfo.dev_queue_read_ahead_path, + sizeof(finfo.dev_queue_read_ahead_path), + "/sys/block/%s/queue/read_ahead_kb", + str) >= sizeof(finfo.dev_queue_read_ahead_path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return; + } + ++end; + } + printf("%s: Could not read: %s\n", __func__, path); + exit(EXIT_FAILURE); } - static bool check_swap(void *addr, unsigned long size) { bool swap = false; @@ -409,7 +535,7 @@ static bool check_swap(void *addr, unsigned long size) printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); exit(EXIT_FAILURE); } - if (!check_for_pattern(fp, addr_pattern, buffer)) + if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) goto err_out; ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB", @@ -422,7 +548,7 @@ static bool check_swap(void *addr, unsigned long size) * Fetch the Swap: in the same block and check whether it got * the expected number of hugeepages next. */ - if (!check_for_pattern(fp, "Swap:", buffer)) + if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer))) goto err_out; if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) @@ -434,12 +560,12 @@ err_out: return swap; } -static void *alloc_mapping(void) +static void *alloc_mapping(int nr) { void *p; - p = mmap(BASE_ADDR, hpage_pmd_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (p != BASE_ADDR) { printf("Failed to allocate VMA at %p\n", BASE_ADDR); exit(EXIT_FAILURE); @@ -456,6 +582,60 @@ static void fill_memory(int *p, unsigned long start, unsigned long end) p[i * page_size / sizeof(*p)] = i + 0xdead0000; } +/* + * MADV_COLLAPSE is a best-effort request and may fail if an internal + * resource is temporarily unavailable, in which case it will set errno to + * EAGAIN. In such a case, immediately reattempt the operation one more + * time. + */ +static int madvise_collapse_retry(void *p, unsigned long size) +{ + bool retry = true; + int ret; + +retry: + ret = madvise(p, size, MADV_COLLAPSE); + if (ret && errno == EAGAIN && retry) { + retry = false; + goto retry; + } + return ret; +} + +/* + * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with + * validate_memory()'able contents. + */ +static void *alloc_hpage(struct mem_ops *ops) +{ + void *p = ops->setup_area(1); + + ops->fault(p, 0, hpage_pmd_size); + + /* + * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE. + * The latter is ineligible for collapse by MADV_COLLAPSE + * while the former might cause MADV_COLLAPSE to race with + * khugepaged on low-load system (like a test machine), which + * would cause MADV_COLLAPSE to fail with EAGAIN. 
+ */ + printf("Allocate huge page..."); + if (madvise_collapse_retry(p, hpage_pmd_size)) { + perror("madvise(MADV_COLLAPSE)"); + exit(EXIT_FAILURE); + } + if (!ops->check_huge(p, 1)) { + perror("madvise(MADV_COLLAPSE)"); + exit(EXIT_FAILURE); + } + if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) { + perror("madvise(MADV_HUGEPAGE)"); + exit(EXIT_FAILURE); + } + success("OK"); + return p; +} + static void validate_memory(int *p, unsigned long start, unsigned long end) { int i; @@ -469,26 +649,216 @@ static void validate_memory(int *p, unsigned long start, unsigned long end) } } +static void *anon_setup_area(int nr_hpages) +{ + return alloc_mapping(nr_hpages); +} + +static void anon_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); +} + +static void anon_fault(void *p, unsigned long start, unsigned long end) +{ + fill_memory(p, start, end); +} + +static bool anon_check_huge(void *addr, int nr_hpages) +{ + return check_huge_anon(addr, nr_hpages, hpage_pmd_size); +} + +static void *file_setup_area(int nr_hpages) +{ + int fd; + void *p; + unsigned long size; + + unlink(finfo.path); /* Cleanup from previous failed tests */ + printf("Creating %s for collapse%s...", finfo.path, + finfo.type == VMA_SHMEM ? " (tmpfs)" : ""); + fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL, + 777); + if (fd < 0) { + perror("open()"); + exit(EXIT_FAILURE); + } + + size = nr_hpages * hpage_pmd_size; + p = alloc_mapping(nr_hpages); + fill_memory(p, 0, size); + write(fd, p, size); + close(fd); + munmap(p, size); + success("OK"); + + printf("Opening %s read only for collapse...", finfo.path); + finfo.fd = open(finfo.path, O_RDONLY, 777); + if (finfo.fd < 0) { + perror("open()"); + exit(EXIT_FAILURE); + } + p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC, + MAP_PRIVATE, finfo.fd, 0); + if (p == MAP_FAILED || p != BASE_ADDR) { + perror("mmap()"); + exit(EXIT_FAILURE); + } + + /* Drop page cache */ + write_file("/proc/sys/vm/drop_caches", "3", 2); + success("OK"); + return p; +} + +static void file_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); + close(finfo.fd); + unlink(finfo.path); +} + +static void file_fault(void *p, unsigned long start, unsigned long end) +{ + if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) { + perror("madvise(MADV_POPULATE_READ"); + exit(EXIT_FAILURE); + } +} + +static bool file_check_huge(void *addr, int nr_hpages) +{ + switch (finfo.type) { + case VMA_FILE: + return check_huge_file(addr, nr_hpages, hpage_pmd_size); + case VMA_SHMEM: + return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); + default: + exit(EXIT_FAILURE); + return false; + } +} + +static void *shmem_setup_area(int nr_hpages) +{ + void *p; + unsigned long size = nr_hpages * hpage_pmd_size; + + finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0); + if (finfo.fd < 0) { + perror("memfd_create()"); + exit(EXIT_FAILURE); + } + if (ftruncate(finfo.fd, size)) { + perror("ftruncate()"); + exit(EXIT_FAILURE); + } + p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd, + 0); + if (p != BASE_ADDR) { + perror("mmap()"); + exit(EXIT_FAILURE); + } + return p; +} + +static void shmem_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); + close(finfo.fd); +} + +static bool shmem_check_huge(void *addr, int nr_hpages) +{ + return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); +} + +static struct mem_ops __anon_ops = { + .setup_area = &anon_setup_area, + .cleanup_area = &anon_cleanup_area, + .fault = &anon_fault, + .check_huge = 
&anon_check_huge, + .name = "anon", +}; + +static struct mem_ops __file_ops = { + .setup_area = &file_setup_area, + .cleanup_area = &file_cleanup_area, + .fault = &file_fault, + .check_huge = &file_check_huge, + .name = "file", +}; + +static struct mem_ops __shmem_ops = { + .setup_area = &shmem_setup_area, + .cleanup_area = &shmem_cleanup_area, + .fault = &anon_fault, + .check_huge = &shmem_check_huge, + .name = "shmem", +}; + +static void __madvise_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) +{ + int ret; + struct settings settings = *current_settings(); + + printf("%s...", msg); + + /* + * Prevent khugepaged interference and tests that MADV_COLLAPSE + * ignores /sys/kernel/mm/transparent_hugepage/enabled + */ + settings.thp_enabled = THP_NEVER; + settings.shmem_enabled = SHMEM_NEVER; + push_settings(&settings); + + /* Clear VM_NOHUGEPAGE */ + madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); + ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size); + if (((bool)ret) == expect) + fail("Fail: Bad return value"); + else if (!ops->check_huge(p, expect ? nr_hpages : 0)) + fail("Fail: check_huge()"); + else + success("OK"); + + pop_settings(); +} + +static void madvise_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) +{ + /* Sanity check */ + if (!ops->check_huge(p, 0)) { + printf("Unexpected huge page\n"); + exit(EXIT_FAILURE); + } + __madvise_collapse(msg, p, nr_hpages, ops, expect); +} + #define TICK 500000 -static bool wait_for_scan(const char *msg, char *p) +static bool wait_for_scan(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops) { int full_scans; int timeout = 6; /* 3 seconds */ /* Sanity check */ - if (check_huge(p)) { + if (!ops->check_huge(p, 0)) { printf("Unexpected huge page\n"); exit(EXIT_FAILURE); } - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); /* Wait until the second full_scan completed */ full_scans = read_num("khugepaged/full_scans") + 2; printf("%s...", msg); while (timeout--) { - if (check_huge(p)) + if (ops->check_huge(p, nr_hpages)) break; if (read_num("khugepaged/full_scans") >= full_scans) break; @@ -496,122 +866,155 @@ static bool wait_for_scan(const char *msg, char *p) usleep(TICK); } - madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE); return timeout == -1; } +static void khugepaged_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) +{ + if (wait_for_scan(msg, p, nr_hpages, ops)) { + if (expect) + fail("Timeout"); + else + success("OK"); + return; + } + + /* + * For file and shmem memory, khugepaged only retracts pte entries after + * putting the new hugepage in the page cache. The hugepage must be + * subsequently refaulted to install the pmd mapping for the mm. + */ + if (ops != &__anon_ops) + ops->fault(p, 0, nr_hpages * hpage_pmd_size); + + if (ops->check_huge(p, expect ? 
nr_hpages : 0)) + success("OK"); + else + fail("Fail"); +} + +static struct collapse_context __khugepaged_context = { + .collapse = &khugepaged_collapse, + .enforce_pte_scan_limits = true, + .name = "khugepaged", +}; + +static struct collapse_context __madvise_context = { + .collapse = &madvise_collapse, + .enforce_pte_scan_limits = false, + .name = "madvise", +}; + +static bool is_tmpfs(struct mem_ops *ops) +{ + return ops == &__file_ops && finfo.type == VMA_SHMEM; +} + static void alloc_at_fault(void) { - struct settings settings = default_settings; + struct settings settings = *current_settings(); char *p; settings.thp_enabled = THP_ALWAYS; - write_settings(&settings); + push_settings(&settings); - p = alloc_mapping(); + p = alloc_mapping(1); *p = 1; printf("Allocate huge page on fault..."); - if (check_huge(p)) + if (check_huge_anon(p, 1, hpage_pmd_size)) success("OK"); else fail("Fail"); - write_settings(&default_settings); + pop_settings(); madvise(p, page_size, MADV_DONTNEED); printf("Split huge PMD on MADV_DONTNEED..."); - if (!check_huge(p)) + if (check_huge_anon(p, 0, hpage_pmd_size)) success("OK"); else fail("Fail"); munmap(p, hpage_pmd_size); } -static void collapse_full(void) +static void collapse_full(struct collapse_context *c, struct mem_ops *ops) { void *p; - - p = alloc_mapping(); - fill_memory(p, 0, hpage_pmd_size); - if (wait_for_scan("Collapse fully populated PTE table", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); - validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + int nr_hpages = 4; + unsigned long size = nr_hpages * hpage_pmd_size; + + p = ops->setup_area(nr_hpages); + ops->fault(p, 0, size); + c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages, + ops, true); + validate_memory(p, 0, size); + ops->cleanup_area(p, size); } -static void collapse_empty(void) +static void collapse_empty(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_mapping(); - if (wait_for_scan("Do not collapse empty PTE table", p)) - fail("Timeout"); - else if (check_huge(p)) - fail("Fail"); - else - success("OK"); - munmap(p, hpage_pmd_size); + p = ops->setup_area(1); + c->collapse("Do not collapse empty PTE table", p, 1, ops, false); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_single_pte_entry(void) +static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_mapping(); - fill_memory(p, 0, page_size); - if (wait_for_scan("Collapse PTE table with single PTE entry present", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); - validate_memory(p, 0, page_size); - munmap(p, hpage_pmd_size); + p = ops->setup_area(1); + ops->fault(p, 0, page_size); + c->collapse("Collapse PTE table with single PTE entry present", p, + 1, ops, true); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_max_ptes_none(void) +static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops) { int max_ptes_none = hpage_pmd_nr / 2; - struct settings settings = default_settings; + struct settings settings = *current_settings(); void *p; settings.khugepaged.max_ptes_none = max_ptes_none; - write_settings(&settings); + push_settings(&settings); - p = alloc_mapping(); + p = ops->setup_area(1); - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); - if (wait_for_scan("Do not collapse with max_ptes_none exceeded", p)) - fail("Timeout"); - else if (check_huge(p)) - fail("Fail"); 
- else - success("OK"); - validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + if (is_tmpfs(ops)) { + /* shmem pages always in the page cache */ + printf("tmpfs..."); + skip("Skip"); + goto skip; + } - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); - if (wait_for_scan("Collapse with max_ptes_none PTEs empty", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); - validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1, + ops, !c->enforce_pte_scan_limits); + validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); - munmap(p, hpage_pmd_size); - write_settings(&default_settings); + if (c->enforce_pte_scan_limits) { + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops, + true); + validate_memory(p, 0, + (hpage_pmd_nr - max_ptes_none) * page_size); + } +skip: + ops->cleanup_area(p, hpage_pmd_size); + pop_settings(); } -static void collapse_swapin_single_pte(void) +static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_mapping(); - fill_memory(p, 0, hpage_pmd_size); + + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); printf("Swapout one page..."); if (madvise(p, page_size, MADV_PAGEOUT)) { @@ -625,25 +1028,21 @@ static void collapse_swapin_single_pte(void) goto out; } - if (wait_for_scan("Collapse with swapping in single PTE entry", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse with swapping in single PTE entry", p, 1, ops, + true); validate_memory(p, 0, hpage_pmd_size); out: - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_max_ptes_swap(void) +static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops) { int max_ptes_swap = read_num("khugepaged/max_ptes_swap"); void *p; - p = alloc_mapping(); + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); - fill_memory(p, 0, hpage_pmd_size); printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) { perror("madvise(MADV_PAGEOUT)"); @@ -656,115 +1055,93 @@ static void collapse_max_ptes_swap(void) goto out; } - if (wait_for_scan("Do not collapse with max_ptes_swap exceeded", p)) - fail("Timeout"); - else if (check_huge(p)) - fail("Fail"); - else - success("OK"); + c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops, + !c->enforce_pte_scan_limits); validate_memory(p, 0, hpage_pmd_size); - fill_memory(p, 0, hpage_pmd_size); - printf("Swapout %d of %d pages...", max_ptes_swap, hpage_pmd_nr); - if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { - perror("madvise(MADV_PAGEOUT)"); - exit(EXIT_FAILURE); - } - if (check_swap(p, max_ptes_swap * page_size)) { - success("OK"); - } else { - fail("Fail"); - goto out; - } + if (c->enforce_pte_scan_limits) { + ops->fault(p, 0, hpage_pmd_size); + printf("Swapout %d of %d pages...", max_ptes_swap, + hpage_pmd_nr); + if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, max_ptes_swap * page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } - if (wait_for_scan("Collapse with max_ptes_swap pages swapped out", p)) - 
fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); - validate_memory(p, 0, hpage_pmd_size); + c->collapse("Collapse with max_ptes_swap pages swapped out", p, + 1, ops, true); + validate_memory(p, 0, hpage_pmd_size); + } out: - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_single_pte_entry_compound(void) +static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_mapping(); + p = alloc_hpage(ops); - printf("Allocate huge page..."); - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p)) - success("OK"); - else - fail("Fail"); - madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + if (is_tmpfs(ops)) { + /* MADV_DONTNEED won't evict tmpfs pages */ + printf("tmpfs..."); + skip("Skip"); + goto skip; + } + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); printf("Split huge page leaving single PTE mapping compound page..."); madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); - if (!check_huge(p)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); - if (wait_for_scan("Collapse PTE table with single PTE mapping compound page", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table with single PTE mapping compound page", + p, 1, ops, true); validate_memory(p, 0, page_size); - munmap(p, hpage_pmd_size); +skip: + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_full_of_compound(void) +static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_mapping(); - - printf("Allocate huge page..."); - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p)) - success("OK"); - else - fail("Fail"); - + p = alloc_hpage(ops); printf("Split huge page leaving single PTE page table full of compound pages..."); madvise(p, page_size, MADV_NOHUGEPAGE); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - if (!check_huge(p)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); - if (wait_for_scan("Collapse PTE table full of compound pages", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table full of compound pages", p, 1, ops, + true); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_compound_extreme(void) +static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops) { void *p; int i; - p = alloc_mapping(); + p = ops->setup_area(1); for (i = 0; i < hpage_pmd_nr; i++) { printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...", i + 1, hpage_pmd_nr); madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(BASE_ADDR, 0, hpage_pmd_size); - if (!check_huge(BASE_ADDR)) { + ops->fault(BASE_ADDR, 0, hpage_pmd_size); + if (!ops->check_huge(BASE_ADDR, 1)) { printf("Failed to allocate huge page\n"); exit(EXIT_FAILURE); } @@ -791,34 +1168,30 @@ static void collapse_compound_extreme(void) } } - munmap(BASE_ADDR, hpage_pmd_size); - fill_memory(p, 0, hpage_pmd_size); - if (!check_huge(p)) + ops->cleanup_area(BASE_ADDR, hpage_pmd_size); + ops->fault(p, 0, hpage_pmd_size); + if (!ops->check_huge(p, 1)) success("OK"); else fail("Fail"); - if (wait_for_scan("Collapse PTE table full of different compound pages", p)) - fail("Timeout"); - else if 
(check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table full of different compound pages", p, 1, + ops, true); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_fork(void) +static void collapse_fork(struct collapse_context *c, struct mem_ops *ops) { int wstatus; void *p; - p = alloc_mapping(); + p = ops->setup_area(1); printf("Allocate small page..."); - fill_memory(p, 0, page_size); - if (!check_huge(p)) + ops->fault(p, 0, page_size); + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); @@ -829,22 +1202,17 @@ static void collapse_fork(void) skip_settings_restore = true; exit_status = 0; - if (!check_huge(p)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); - fill_memory(p, page_size, 2 * page_size); - - if (wait_for_scan("Collapse PTE table with single page shared with parent process", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + ops->fault(p, page_size, 2 * page_size); + c->collapse("Collapse PTE table with single page shared with parent process", + p, 1, ops, true); validate_memory(p, 0, page_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); exit(exit_status); } @@ -852,36 +1220,27 @@ static void collapse_fork(void) exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has small page..."); - if (!check_huge(p)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); validate_memory(p, 0, page_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_fork_compound(void) +static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops) { int wstatus; void *p; - p = alloc_mapping(); - - printf("Allocate huge page..."); - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p)) - success("OK"); - else - fail("Fail"); - + p = alloc_hpage(ops); printf("Share huge page over fork()..."); if (!fork()) { /* Do not touch settings on child exit */ skip_settings_restore = true; exit_status = 0; - if (check_huge(p)) + if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); @@ -889,24 +1248,20 @@ static void collapse_fork_compound(void) printf("Split huge page PMD in child process..."); madvise(p, page_size, MADV_NOHUGEPAGE); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - if (!check_huge(p)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); - fill_memory(p, 0, page_size); + ops->fault(p, 0, page_size); write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); - if (wait_for_scan("Collapse PTE table full of compound pages in child", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table full of compound pages in child", + p, 1, ops, true); write_num("khugepaged/max_ptes_shared", - default_settings.khugepaged.max_ptes_shared); + current_settings()->khugepaged.max_ptes_shared); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); exit(exit_status); } @@ -914,74 +1269,59 @@ static void collapse_fork_compound(void) exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has huge page..."); - if (check_huge(p)) + if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_max_ptes_shared() 
+static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops) { int max_ptes_shared = read_num("khugepaged/max_ptes_shared"); int wstatus; void *p; - p = alloc_mapping(); - - printf("Allocate huge page..."); - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p)) - success("OK"); - else - fail("Fail"); - + p = alloc_hpage(ops); printf("Share huge page over fork()..."); if (!fork()) { /* Do not touch settings on child exit */ skip_settings_restore = true; exit_status = 0; - if (check_huge(p)) + if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); printf("Trigger CoW on page %d of %d...", hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); - if (!check_huge(p)) - success("OK"); - else - fail("Fail"); - - if (wait_for_scan("Do not collapse with max_ptes_shared exceeded", p)) - fail("Timeout"); - else if (!check_huge(p)) - success("OK"); - else - fail("Fail"); - - printf("Trigger CoW on page %d of %d...", - hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size); - if (!check_huge(p)) + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); - - if (wait_for_scan("Collapse with max_ptes_shared PTEs shared", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Maybe collapse with max_ptes_shared exceeded", p, + 1, ops, !c->enforce_pte_scan_limits); + + if (c->enforce_pte_scan_limits) { + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) * + page_size); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + c->collapse("Collapse with max_ptes_shared PTEs shared", + p, 1, ops, true); + } validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); exit(exit_status); } @@ -989,20 +1329,153 @@ static void collapse_max_ptes_shared() exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has huge page..."); - if (check_huge(p)) + if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -int main(void) +static void madvise_collapse_existing_thps(struct collapse_context *c, + struct mem_ops *ops) { + void *p; + + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); + c->collapse("Collapse fully populated PTE table...", p, 1, ops, true); + validate_memory(p, 0, hpage_pmd_size); + + /* c->collapse() will find a hugepage and complain - call directly. */ + __madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true); + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +/* + * Test race with khugepaged where page tables have been retracted and + * pmd cleared. 
+ */ +static void madvise_retracted_page_tables(struct collapse_context *c, + struct mem_ops *ops) +{ + void *p; + int nr_hpages = 1; + unsigned long size = nr_hpages * hpage_pmd_size; + + p = ops->setup_area(nr_hpages); + ops->fault(p, 0, size); + + /* Let khugepaged collapse and leave pmd cleared */ + if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages, + ops)) { + fail("Timeout"); + return; + } + success("OK"); + c->collapse("Install huge PMD from page cache", p, nr_hpages, ops, + true); + validate_memory(p, 0, size); + ops->cleanup_area(p, size); +} + +static void usage(void) +{ + fprintf(stderr, "\nUsage: ./khugepaged <test type> [dir]\n\n"); + fprintf(stderr, "\t<test type>\t: <context>:<mem_type>\n"); + fprintf(stderr, "\t<context>\t: [all|khugepaged|madvise]\n"); + fprintf(stderr, "\t<mem_type>\t: [all|anon|file|shmem]\n"); + fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n"); + fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n"); + fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"); + fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n"); + fprintf(stderr, "\tmounted with huge=madvise option for khugepaged tests to work\n"); + exit(1); +} + +static void parse_test_type(int argc, const char **argv) +{ + char *buf; + const char *token; + + if (argc == 1) { + /* Backwards compatibility */ + khugepaged_context = &__khugepaged_context; + madvise_context = &__madvise_context; + anon_ops = &__anon_ops; + return; + } + + buf = strdup(argv[1]); + token = strsep(&buf, ":"); + + if (!strcmp(token, "all")) { + khugepaged_context = &__khugepaged_context; + madvise_context = &__madvise_context; + } else if (!strcmp(token, "khugepaged")) { + khugepaged_context = &__khugepaged_context; + } else if (!strcmp(token, "madvise")) { + madvise_context = &__madvise_context; + } else { + usage(); + } + + if (!buf) + usage(); + + if (!strcmp(buf, "all")) { + file_ops = &__file_ops; + anon_ops = &__anon_ops; + shmem_ops = &__shmem_ops; + } else if (!strcmp(buf, "anon")) { + anon_ops = &__anon_ops; + } else if (!strcmp(buf, "file")) { + file_ops = &__file_ops; + } else if (!strcmp(buf, "shmem")) { + shmem_ops = &__shmem_ops; + } else { + usage(); + } + + if (!file_ops) + return; + + if (argc != 3) + usage(); +} + +int main(int argc, const char **argv) +{ + struct settings default_settings = { + .thp_enabled = THP_MADVISE, + .thp_defrag = THP_DEFRAG_ALWAYS, + .shmem_enabled = SHMEM_ADVISE, + .use_zero_page = 0, + .khugepaged = { + .defrag = 1, + .alloc_sleep_millisecs = 10, + .scan_sleep_millisecs = 10, + }, + /* + * When testing file-backed memory, the collapse path + * looks at how many pages are found in the page cache, not + * what pages are mapped. Disable read ahead optimization so + * pages don't find their way into the page cache unless + * we mem_ops->fault() them in. 
+ */ + .read_ahead_kb = 0, + }; + + parse_test_type(argc, argv); + + if (file_ops) + get_finfo(argv[2]); + setbuf(stdout, NULL); page_size = getpagesize(); - hpage_pmd_size = read_num("hpage_pmd_size"); + hpage_pmd_size = read_pmd_pagesize(); hpage_pmd_nr = hpage_pmd_size / page_size; default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1; @@ -1011,21 +1484,75 @@ int main(void) default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; save_settings(); - adjust_settings(); + push_settings(&default_settings); alloc_at_fault(); - collapse_full(); - collapse_empty(); - collapse_single_pte_entry(); - collapse_max_ptes_none(); - collapse_swapin_single_pte(); - collapse_max_ptes_swap(); - collapse_single_pte_entry_compound(); - collapse_full_of_compound(); - collapse_compound_extreme(); - collapse_fork(); - collapse_fork_compound(); - collapse_max_ptes_shared(); + +#define TEST(t, c, o) do { \ + if (c && o) { \ + printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \ + t(c, o); \ + } \ + } while (0) + + TEST(collapse_full, khugepaged_context, anon_ops); + TEST(collapse_full, khugepaged_context, file_ops); + TEST(collapse_full, khugepaged_context, shmem_ops); + TEST(collapse_full, madvise_context, anon_ops); + TEST(collapse_full, madvise_context, file_ops); + TEST(collapse_full, madvise_context, shmem_ops); + + TEST(collapse_empty, khugepaged_context, anon_ops); + TEST(collapse_empty, madvise_context, anon_ops); + + TEST(collapse_single_pte_entry, khugepaged_context, anon_ops); + TEST(collapse_single_pte_entry, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops); + TEST(collapse_single_pte_entry, madvise_context, anon_ops); + TEST(collapse_single_pte_entry, madvise_context, file_ops); + TEST(collapse_single_pte_entry, madvise_context, shmem_ops); + + TEST(collapse_max_ptes_none, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_none, khugepaged_context, file_ops); + TEST(collapse_max_ptes_none, madvise_context, anon_ops); + TEST(collapse_max_ptes_none, madvise_context, file_ops); + + TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops); + TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops); + TEST(collapse_single_pte_entry_compound, madvise_context, file_ops); + + TEST(collapse_full_of_compound, khugepaged_context, anon_ops); + TEST(collapse_full_of_compound, khugepaged_context, file_ops); + TEST(collapse_full_of_compound, khugepaged_context, shmem_ops); + TEST(collapse_full_of_compound, madvise_context, anon_ops); + TEST(collapse_full_of_compound, madvise_context, file_ops); + TEST(collapse_full_of_compound, madvise_context, shmem_ops); + + TEST(collapse_compound_extreme, khugepaged_context, anon_ops); + TEST(collapse_compound_extreme, madvise_context, anon_ops); + + TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops); + TEST(collapse_swapin_single_pte, madvise_context, anon_ops); + + TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_swap, madvise_context, anon_ops); + + TEST(collapse_fork, khugepaged_context, anon_ops); + TEST(collapse_fork, madvise_context, anon_ops); + + TEST(collapse_fork_compound, khugepaged_context, anon_ops); + TEST(collapse_fork_compound, madvise_context, anon_ops); + + TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_shared, madvise_context, anon_ops); + + TEST(madvise_collapse_existing_thps, madvise_context, 
anon_ops); + TEST(madvise_collapse_existing_thps, madvise_context, file_ops); + TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops); + + TEST(madvise_retracted_page_tables, madvise_context, file_ops); + TEST(madvise_retracted_page_tables, madvise_context, shmem_ops); restore_settings(0); } diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c index db0270127aeb..9496346973d4 100644 --- a/tools/testing/selftests/vm/mremap_test.c +++ b/tools/testing/selftests/vm/mremap_test.c @@ -119,6 +119,50 @@ static unsigned long long get_mmap_min_addr(void) } /* + * This test validates that merge is called when expanding a mapping. + * Mapping containing three pages is created, middle page is unmapped + * and then the mapping containing the first page is expanded so that + * it fills the created hole. The two parts should merge creating + * single mapping with three pages. + */ +static void mremap_expand_merge(unsigned long page_size) +{ + char *test_name = "mremap expand merge"; + FILE *fp; + char *line = NULL; + size_t len = 0; + bool success = false; + char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + munmap(start + page_size, page_size); + mremap(start, page_size, 2 * page_size, 0); + + fp = fopen("/proc/self/maps", "r"); + if (fp == NULL) { + ksft_test_result_fail("%s\n", test_name); + return; + } + + while (getline(&line, &len, fp) != -1) { + char *first = strtok(line, "- "); + void *first_val = (void *)strtol(first, NULL, 16); + char *second = strtok(NULL, "- "); + void *second_val = (void *) strtol(second, NULL, 16); + + if (first_val == start && second_val == start + 3 * page_size) { + success = true; + break; + } + } + if (success) + ksft_test_result_pass("%s\n", test_name); + else + ksft_test_result_fail("%s\n", test_name); + fclose(fp); +} + +/* * Returns the start address of the mapping on success, else returns * NULL on failure. */ @@ -336,6 +380,7 @@ int main(int argc, char **argv) int i, run_perf_tests; unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; unsigned int pattern_seed; + int num_expand_tests = 1; struct test test_cases[MAX_TEST]; struct test perf_test_cases[MAX_PERF_TEST]; int page_size; @@ -407,12 +452,14 @@ int main(int argc, char **argv) (threshold_mb * _1MB >= _1GB); ksft_set_plan(ARRAY_SIZE(test_cases) + (run_perf_tests ? - ARRAY_SIZE(perf_test_cases) : 0)); + ARRAY_SIZE(perf_test_cases) : 0) + num_expand_tests); for (i = 0; i < ARRAY_SIZE(test_cases); i++) run_mremap_test_case(test_cases[i], &failures, threshold_mb, pattern_seed); + mremap_expand_merge(page_size); + if (run_perf_tests) { ksft_print_msg("\n%s\n", "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:"); diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index de86983b8a0f..e780e76c26b8 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -120,11 +120,16 @@ run_test ./gup_test -a # Dump pages 0, 19, and 4096, using pin_user_pages: run_test ./gup_test -ct -F 0x1 0 19 0x1000 -run_test ./userfaultfd anon 20 16 -# Test requires source and destination huge pages. Size of source -# (half_ufd_size_MB) is passed as argument to test. -run_test ./userfaultfd hugetlb "$half_ufd_size_MB" 32 -run_test ./userfaultfd shmem 20 16 +uffd_mods=("" ":dev") +for mod in "${uffd_mods[@]}"; do + run_test ./userfaultfd anon${mod} 20 16 + # Hugetlb tests require source and destination huge pages. 
Pass in half + # the size ($half_ufd_size_MB), which is used for *each*. + run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32 + run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32 "$mnt"/uffd-test + rm -f "$mnt"/uffd-test + run_test ./userfaultfd shmem${mod} 20 16 +done #cleanup umount "$mnt" diff --git a/tools/testing/selftests/vm/soft-dirty.c b/tools/testing/selftests/vm/soft-dirty.c index e3a43f5d4fa2..21d8830c5f24 100644 --- a/tools/testing/selftests/vm/soft-dirty.c +++ b/tools/testing/selftests/vm/soft-dirty.c @@ -91,7 +91,7 @@ static void test_hugepage(int pagemap_fd, int pagesize) for (i = 0; i < hpage_len; i++) map[i] = (char)i; - if (check_huge(map)) { + if (check_huge_anon(map, 1, hpage_len)) { ksft_test_result_pass("Test %s huge page allocation\n", __func__); clear_softdirty(); diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c index 6aa2b8253aed..76e1c36dd9e5 100644 --- a/tools/testing/selftests/vm/split_huge_page_test.c +++ b/tools/testing/selftests/vm/split_huge_page_test.c @@ -92,7 +92,6 @@ void split_pmd_thp(void) { char *one_page; size_t len = 4 * pmd_pagesize; - uint64_t thp_size; size_t i; one_page = memalign(pmd_pagesize, len); @@ -107,8 +106,7 @@ void split_pmd_thp(void) for (i = 0; i < len; i++) one_page[i] = (char)i; - thp_size = check_huge(one_page); - if (!thp_size) { + if (!check_huge_anon(one_page, 1, pmd_pagesize)) { printf("No THP is allocated\n"); exit(EXIT_FAILURE); } @@ -124,9 +122,8 @@ void split_pmd_thp(void) } - thp_size = check_huge(one_page); - if (thp_size) { - printf("Still %ld kB AnonHugePages not split\n", thp_size); + if (check_huge_anon(one_page, 0, pmd_pagesize)) { + printf("Still AnonHugePages not split\n"); exit(EXIT_FAILURE); } @@ -172,8 +169,7 @@ void split_pte_mapped_thp(void) for (i = 0; i < len; i++) one_page[i] = (char)i; - thp_size = check_huge(one_page); - if (!thp_size) { + if (!check_huge_anon(one_page, 1, pmd_pagesize)) { printf("No THP is allocated\n"); exit(EXIT_FAILURE); } diff --git a/tools/testing/selftests/vm/test_hmm.sh b/tools/testing/selftests/vm/test_hmm.sh index 539c9371e592..46e19b5d648d 100755 --- a/tools/testing/selftests/vm/test_hmm.sh +++ b/tools/testing/selftests/vm/test_hmm.sh @@ -52,21 +52,11 @@ load_driver() usage fi fi - if [ $? == 0 ]; then - major=$(awk "\$2==\"HMM_DMIRROR\" {print \$1}" /proc/devices) - mknod /dev/hmm_dmirror0 c $major 0 - mknod /dev/hmm_dmirror1 c $major 1 - if [ $# -eq 2 ]; then - mknod /dev/hmm_dmirror2 c $major 2 - mknod /dev/hmm_dmirror3 c $major 3 - fi - fi } unload_driver() { modprobe -r $DRIVER > /dev/null 2>&1 - rm -f /dev/hmm_dmirror? 
} run_smoke() diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 7c3f1b0ab468..297f250c1d95 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -61,10 +61,11 @@ #include <sys/random.h> #include "../kselftest.h" +#include "vm_util.h" #ifdef __NR_userfaultfd -static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; +static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; #define BOUNCE_RANDOM (1<<0) #define BOUNCE_RACINGFAULTS (1<<1) @@ -77,6 +78,13 @@ static int bounces; #define TEST_SHMEM 3 static int test_type; +#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) + +#define BASE_PMD_ADDR ((void *)(1UL << 30)) + +/* test using /dev/userfaultfd, instead of userfaultfd(2) */ +static bool test_dev_userfaultfd; + /* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */ #define ALARM_INTERVAL_SECS 10 static volatile bool test_uffdio_copy_eexist = true; @@ -92,9 +100,10 @@ static int huge_fd; static unsigned long long *count_verify; static int uffd = -1; static int uffd_flags, finished, *pipefd; -static char *area_src, *area_src_alias, *area_dst, *area_dst_alias; +static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; static char *zeropage; pthread_attr_t attr; +static bool test_collapse; /* Userfaultfd test statistics */ struct uffd_stats { @@ -122,9 +131,13 @@ struct uffd_stats { #define swap(a, b) \ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) +#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1))) + const char *examples = "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" "./userfaultfd anon 100 99999\n\n" + "# Run the same anonymous memory test, but using /dev/userfaultfd:\n" + "./userfaultfd anon:dev 100 99999\n\n" "# Run shared memory test on 1GiB region with 99 bounces:\n" "./userfaultfd shmem 1000 99\n\n" "# Run hugetlb memory test on 256MiB region with 50 bounces:\n" @@ -141,6 +154,16 @@ static void usage(void) "[hugetlbfs_file]\n\n"); fprintf(stderr, "Supported <test type>: anon, hugetlb, " "hugetlb_shared, shmem\n\n"); + fprintf(stderr, "'Test mods' can be joined to the test type string with a ':'. " "Supported mods:\n"); + fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n"); + fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n"); + fprintf(stderr, "\tcollapse - Test MADV_COLLAPSE of UFFDIO_REGISTER_MODE_MINOR " "memory\n"); + fprintf(stderr, "\nExample test mod usage:\n"); + fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n"); + fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n"); + fprintf(stderr, "Examples:\n\n"); fprintf(stderr, "%s", examples); exit(1); @@ -154,12 +177,14 @@ static void usage(void) ret, __LINE__); \ } while (0) -#define err(fmt, ...) \ +#define errexit(exitcode, fmt, ...) \ do { \ _err(fmt, ##__VA_ARGS__); \ - exit(1); \ + exit(exitcode); \ } while (0) +#define err(fmt, ...) 
errexit(1, fmt, ##__VA_ARGS__) + static void uffd_stats_reset(struct uffd_stats *uffd_stats, unsigned long n_cpus) { @@ -212,12 +237,10 @@ static void anon_release_pages(char *rel_area) err("madvise(MADV_DONTNEED) failed"); } -static void anon_allocate_area(void **alloc_area) +static void anon_allocate_area(void **alloc_area, bool is_src) { *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (*alloc_area == MAP_FAILED) - err("mmap of anonymous memory failed"); } static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) @@ -235,7 +258,7 @@ static void hugetlb_release_pages(char *rel_area) } } -static void hugetlb_allocate_area(void **alloc_area) +static void hugetlb_allocate_area(void **alloc_area, bool is_src) { void *area_alias = NULL; char **alloc_area_alias; @@ -245,7 +268,7 @@ static void hugetlb_allocate_area(void **alloc_area) nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | - (*alloc_area == area_src ? 0 : MAP_NORESERVE), + (is_src ? 0 : MAP_NORESERVE), -1, 0); else @@ -253,9 +276,9 @@ static void hugetlb_allocate_area(void **alloc_area) nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_SHARED | - (*alloc_area == area_src ? 0 : MAP_NORESERVE), + (is_src ? 0 : MAP_NORESERVE), huge_fd, - *alloc_area == area_src ? 0 : nr_pages * page_size); + is_src ? 0 : nr_pages * page_size); if (*alloc_area == MAP_FAILED) err("mmap of hugetlbfs file failed"); @@ -265,12 +288,12 @@ static void hugetlb_allocate_area(void **alloc_area) PROT_READ | PROT_WRITE, MAP_SHARED, huge_fd, - *alloc_area == area_src ? 0 : nr_pages * page_size); + is_src ? 0 : nr_pages * page_size); if (area_alias == MAP_FAILED) err("mmap of hugetlb file alias failed"); } - if (*alloc_area == area_src) { + if (is_src) { alloc_area_alias = &area_src_alias; } else { alloc_area_alias = &area_dst_alias; @@ -293,21 +316,36 @@ static void shmem_release_pages(char *rel_area) err("madvise(MADV_REMOVE) failed"); } -static void shmem_allocate_area(void **alloc_area) +static void shmem_allocate_area(void **alloc_area, bool is_src) { void *area_alias = NULL; - bool is_src = alloc_area == (void **)&area_src; - unsigned long offset = is_src ? 0 : nr_pages * page_size; + size_t bytes = nr_pages * page_size; + unsigned long offset = is_src ? 
0 : bytes; + char *p = NULL, *p_alias = NULL; + + if (test_collapse) { + p = BASE_PMD_ADDR; + if (!is_src) + /* src map + alias + interleaved hpages */ + p += 2 * (bytes + hpage_size); + p_alias = p; + p_alias += bytes; + p_alias += hpage_size; /* Prevent src/dst VMA merge */ + } - *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - MAP_SHARED, shm_fd, offset); + *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + shm_fd, offset); if (*alloc_area == MAP_FAILED) err("mmap of memfd failed"); + if (test_collapse && *alloc_area != p) + err("mmap of memfd failed at %p", p); - area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - MAP_SHARED, shm_fd, offset); + area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + shm_fd, offset); if (area_alias == MAP_FAILED) err("mmap of memfd alias failed"); + if (test_collapse && area_alias != p_alias) + err("mmap of memfd alias failed at %p", p_alias); if (is_src) area_src_alias = area_alias; @@ -320,28 +358,39 @@ static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) *start = (unsigned long)area_dst_alias + offset; } +static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) +{ + if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size)) + err("Did not find expected %d number of hugepages", + expect_nr_hpages); +} + struct uffd_test_ops { - void (*allocate_area)(void **alloc_area); + void (*allocate_area)(void **alloc_area, bool is_src); void (*release_pages)(char *rel_area); void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); + void (*check_pmd_mapping)(void *p, int expect_nr_hpages); }; static struct uffd_test_ops anon_uffd_test_ops = { .allocate_area = anon_allocate_area, .release_pages = anon_release_pages, .alias_mapping = noop_alias_mapping, + .check_pmd_mapping = NULL, }; static struct uffd_test_ops shmem_uffd_test_ops = { .allocate_area = shmem_allocate_area, .release_pages = shmem_release_pages, .alias_mapping = shmem_alias_mapping, + .check_pmd_mapping = shmem_check_pmd_mapping, }; static struct uffd_test_ops hugetlb_uffd_test_ops = { .allocate_area = hugetlb_allocate_area, .release_pages = hugetlb_release_pages, .alias_mapping = hugetlb_alias_mapping, + .check_pmd_mapping = NULL, }; static struct uffd_test_ops *uffd_test_ops; @@ -383,13 +432,34 @@ static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls) } } +static int __userfaultfd_open_dev(void) +{ + int fd, _uffd; + + fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); + if (fd < 0) + errexit(KSFT_SKIP, "opening /dev/userfaultfd failed"); + + _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS); + if (_uffd < 0) + errexit(errno == ENOTTY ? KSFT_SKIP : 1, + "creating userfaultfd failed"); + close(fd); + return _uffd; +} + static void userfaultfd_open(uint64_t *features) { struct uffdio_api uffdio_api; - uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); - if (uffd < 0) - err("userfaultfd syscall not available in this kernel"); + if (test_dev_userfaultfd) + uffd = __userfaultfd_open_dev(); + else { + uffd = syscall(__NR_userfaultfd, UFFD_FLAGS); + if (uffd < 0) + errexit(errno == ENOSYS ? 
KSFT_SKIP : 1, + "creating userfaultfd failed"); + } uffd_flags = fcntl(uffd, F_GETFD, NULL); uffdio_api.api = UFFD_API; @@ -440,6 +510,7 @@ static void uffd_test_ctx_clear(void) munmap_area((void **)&area_src_alias); munmap_area((void **)&area_dst); munmap_area((void **)&area_dst_alias); + munmap_area((void **)&area_remap); } static void uffd_test_ctx_init(uint64_t features) { @@ -448,8 +519,8 @@ uffd_test_ctx_clear(); - uffd_test_ops->allocate_area((void **)&area_src); - uffd_test_ops->allocate_area((void **)&area_dst); + uffd_test_ops->allocate_area((void **)&area_src, true); + uffd_test_ops->allocate_area((void **)&area_dst, false); userfaultfd_open(&features); @@ -703,7 +774,27 @@ static void uffd_handle_page_fault(struct uffd_msg *msg, continue_range(uffd, msg->arg.pagefault.address, page_size); stats->minor_faults++; } else { - /* Missing page faults */ + /* + * Missing page faults. + * + * Here we force a write check for each of the missing-mode + * faults. This is guaranteed to hold because the only threads + * that trigger uffd faults are the locking threads, and their + * first instruction touching the missing page is always + * pthread_mutex_lock(). + * + * Note that this relies on an NPTL glibc implementation + * detail: the lock type is read at the entry of the lock op + * (pthread_mutex_t.__data.__type, offset 0x10) before any + * locking operation is performed. Relying on this detail is + * not ideal, because a pthread-compatible library could + * logically implement its locks without types, and linking + * with such a library would make this check fail. However, + * since this strict check has caught real bugs before, we + * keep it around as a useful hint for when it fails again. + * If it one day breaks on some other libc implementation, + * we will revisit. + */ if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) err("unexpected write fault"); @@ -766,6 +857,7 @@ static void *uffd_poll_thread(void *arg) err("remove failure"); break; case UFFD_EVENT_REMAP: + area_remap = area_dst; /* save for later unmap */ area_dst = (char *)(unsigned long)msg.arg.remap.to; break; } @@ -1218,13 +1310,30 @@ static int userfaultfd_sig_test(void) return userfaults != 0; } +void check_memory_contents(char *p) +{ + unsigned long i; + uint8_t expected_byte; + void *expected_page; + + if (posix_memalign(&expected_page, page_size, page_size)) + err("out of memory"); + + for (i = 0; i < nr_pages; ++i) { + expected_byte = ~((uint8_t)(i % ((uint8_t)-1))); + memset(expected_page, expected_byte, page_size); + if (my_bcmp(expected_page, p + (i * page_size), page_size)) + err("unexpected page contents after minor fault"); + } + + free(expected_page); +} + static int userfaultfd_minor_test(void) { - struct uffdio_register uffdio_register; unsigned long p; + struct uffdio_register uffdio_register; pthread_t uffd_mon; - uint8_t expected_byte; - void *expected_page; char c; struct uffd_stats stats = { 0 }; @@ -1263,17 +1372,7 @@ static int userfaultfd_minor_test(void) * fault. uffd_poll_thread will resolve the fault by bit-flipping the * page's contents, and then issuing a CONTINUE ioctl. 
*/ - - if (posix_memalign(&expected_page, page_size, page_size)) - err("out of memory"); - - for (p = 0; p < nr_pages; ++p) { - expected_byte = ~((uint8_t)(p % ((uint8_t)-1))); - memset(expected_page, expected_byte, page_size); - if (my_bcmp(expected_page, area_dst_alias + (p * page_size), - page_size)) - err("unexpected page contents after minor fault"); - } + check_memory_contents(area_dst_alias); if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) err("pipe write"); @@ -1282,6 +1381,23 @@ static int userfaultfd_minor_test(void) uffd_stats_report(&stats, 1); + if (test_collapse) { + printf("testing collapse of uffd memory into PMD-mapped THPs:"); + if (madvise(area_dst_alias, nr_pages * page_size, + MADV_COLLAPSE)) + err("madvise(MADV_COLLAPSE)"); + + uffd_test_ops->check_pmd_mapping(area_dst, + nr_pages * page_size / + hpage_size); + /* + * This won't cause an uffd fault - it just makes sure there + * was no corruption. + */ + check_memory_contents(area_dst_alias); + printf(" done.\n"); + } + return stats.missing_faults != 0 || stats.minor_faults != nr_pages; } @@ -1584,8 +1700,6 @@ unsigned long default_huge_page_size(void) static void set_test_type(const char *type) { - uint64_t features = UFFD_API_FEATURES; - if (!strcmp(type, "anon")) { test_type = TEST_ANON; uffd_test_ops = &anon_uffd_test_ops; @@ -1603,12 +1717,37 @@ static void set_test_type(const char *type) test_type = TEST_SHMEM; uffd_test_ops = &shmem_uffd_test_ops; test_uffdio_minor = true; - } else { - err("Unknown test type: %s", type); } +} + +static void parse_test_type_arg(const char *raw_type) +{ + char *buf = strdup(raw_type); + uint64_t features = UFFD_API_FEATURES; + + while (buf) { + const char *token = strsep(&buf, ":"); + + if (!test_type) + set_test_type(token); + else if (!strcmp(token, "dev")) + test_dev_userfaultfd = true; + else if (!strcmp(token, "syscall")) + test_dev_userfaultfd = false; + else if (!strcmp(token, "collapse")) + test_collapse = true; + else + err("unrecognized test mod '%s'", token); + } + + if (!test_type) + err("failed to parse test type argument: '%s'", raw_type); + + if (test_collapse && test_type != TEST_SHMEM) + err("Unsupported test: %s", raw_type); if (test_type == TEST_HUGETLB) - page_size = default_huge_page_size(); + page_size = hpage_size; else page_size = sysconf(_SC_PAGE_SIZE); @@ -1646,6 +1785,8 @@ static void sigalrm(int sig) int main(int argc, char **argv) { + size_t bytes; + if (argc < 4) usage(); @@ -1653,11 +1794,41 @@ int main(int argc, char **argv) err("failed to arm SIGALRM"); alarm(ALARM_INTERVAL_SECS); - set_test_type(argv[1]); + hpage_size = default_huge_page_size(); + parse_test_type_arg(argv[1]); + bytes = atol(argv[2]) * 1024 * 1024; + + if (test_collapse && bytes & (hpage_size - 1)) + err("MiB must be multiple of %lu if :collapse mod set", + hpage_size >> 20); nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size / - nr_cpus; + + if (test_collapse) { + /* nr_cpus must divide (bytes / page_size), otherwise, + * area allocations of (nr_pages * page_size) won't be a + * multiple of hpage_size, even if bytes is a multiple of + * hpage_size. + * + * This means that nr_cpus must divide (N * (1 << (H-P))) + * where: + * bytes = hpage_size * N + * hpage_size = 2 << H + * page_size = 2 << P + * + * And we want to choose nr_cpus to be the largest value + * satisfying this constraint, not larger than the number + * of online CPUs. 
Unfortunately, prime factorization of + * N and nr_cpus may be arbitrary, so we would have to search + * for it. Instead, just use the highest power of 2 dividing + * both nr_cpus and (bytes / page_size). + */ + int x = factor_of_2(nr_cpus); + int y = factor_of_2(bytes / page_size); + + nr_cpus = x < y ? x : y; + } + nr_pages_per_cpu = bytes / page_size / nr_cpus; if (!nr_pages_per_cpu) { _err("invalid MiB"); usage(); diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c index b58ab11a7a30..f11f8adda521 100644 --- a/tools/testing/selftests/vm/vm_util.c +++ b/tools/testing/selftests/vm/vm_util.c @@ -42,9 +42,9 @@ void clear_softdirty(void) ksft_exit_fail_msg("writing clear_refs failed\n"); } -static bool check_for_pattern(FILE *fp, const char *pattern, char *buf) +bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len) { - while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) { + while (fgets(buf, len, fp)) { if (!strncmp(buf, pattern, strlen(pattern))) return true; } @@ -72,9 +72,10 @@ uint64_t read_pmd_pagesize(void) return strtoul(buf, NULL, 10); } -uint64_t check_huge(void *addr) +bool __check_huge(void *addr, char *pattern, int nr_hpages, + uint64_t hpage_size) { - uint64_t thp = 0; + uint64_t thp = -1; int ret; FILE *fp; char buffer[MAX_LINE_LENGTH]; @@ -89,20 +90,37 @@ if (!fp) ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, SMAP_FILE_PATH); - if (!check_for_pattern(fp, addr_pattern, buffer)) + if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) goto err_out; /* - * Fetch the AnonHugePages: in the same block and check the number of + * Fetch the pattern in the same block and check the number of * hugepages. */ - if (!check_for_pattern(fp, "AnonHugePages:", buffer)) + if (!check_for_pattern(fp, pattern, buffer, sizeof(buffer))) goto err_out; - if (sscanf(buffer, "AnonHugePages:%10ld kB", &thp) != 1) + snprintf(addr_pattern, MAX_LINE_LENGTH, "%s%%9ld kB", pattern); + + if (sscanf(buffer, addr_pattern, &thp) != 1) ksft_exit_fail_msg("Error reading smaps\n"); err_out: fclose(fp); - return thp; + return thp == (nr_hpages * (hpage_size >> 10)); +} + +bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "AnonHugePages: ", nr_hpages, hpage_size); +} + +bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "FilePmdMapped:", nr_hpages, hpage_size); +} + +bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "ShmemPmdMapped:", nr_hpages, hpage_size); } diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h index 2e512bd57ae1..5c35de454e08 100644 --- a/tools/testing/selftests/vm/vm_util.h +++ b/tools/testing/selftests/vm/vm_util.h @@ -5,5 +5,8 @@ uint64_t pagemap_get_entry(int fd, char *start); bool pagemap_is_softdirty(int fd, char *start); void clear_softdirty(void); +bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); uint64_t read_pmd_pagesize(void); -uint64_t check_huge(void *addr); +bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); +bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); +bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size);
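For reference, the nr_cpus clamping in the userfaultfd main() above can be exercised in isolation. Below is a minimal standalone sketch of the same largest-power-of-two computation; the factor_of_2() trick isolates the lowest set bit, and all concrete values are made up for illustration (this is not part of the patch):

#include <stdio.h>

/* Lowest set bit of x, i.e. the largest power of two dividing x. */
#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1)))

int main(void)
{
	unsigned long nr_cpus = 12;		/* hypothetical online CPU count */
	unsigned long bytes = 128UL << 20;	/* hypothetical 128 MiB test area */
	unsigned long page_size = 4096;
	unsigned long x = factor_of_2(nr_cpus);			/* 12 = 0b1100 -> 4 */
	unsigned long y = factor_of_2(bytes / page_size);	/* 32768 pages -> 32768 */
	unsigned long workers = x < y ? x : y;			/* 4 */

	/* 32768 / 4 = 8192 pages per worker, still a multiple of a 2 MiB hpage */
	printf("use %lu workers, %lu pages each\n",
	       workers, bytes / page_size / workers);
	return 0;
}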
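Likewise, the __check_huge() helpers added to vm_util.c boil down to scanning /proc/<pid>/smaps: locate the line where the VMA starting at addr begins, then read the AnonHugePages/FilePmdMapped/ShmemPmdMapped counter inside that block. A minimal standalone sketch of the same approach follows; anon_huge_kb() and find_pattern() are illustrative names, not part of the selftests:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

static bool find_pattern(FILE *fp, const char *pattern, char *buf, size_t len)
{
	while (fgets(buf, len, fp))
		if (!strncmp(buf, pattern, strlen(pattern)))
			return true;
	return false;
}

/* AnonHugePages (in kB) of the VMA starting at addr, or -1 on error. */
static long anon_huge_kb(void *addr)
{
	char pat[64], buf[256];
	long kb = -1;
	FILE *fp = fopen("/proc/self/smaps", "r");

	if (!fp)
		return -1;
	/* smaps prints each VMA header as "%08lx-%08lx ..." */
	snprintf(pat, sizeof(pat), "%08lx-", (unsigned long)addr);
	if (find_pattern(fp, pat, buf, sizeof(buf)) &&
	    find_pattern(fp, "AnonHugePages:", buf, sizeof(buf)))
		sscanf(buf, "AnonHugePages:%9ld kB", &kb);
	fclose(fp);
	return kb;
}

int main(void)
{
	void *p = mmap(NULL, 4UL << 20, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	printf("AnonHugePages of %p: %ld kB\n", p, anon_huge_kb(p));
	return 0;
}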