diff options
Diffstat (limited to 'tools/testing/selftests')
20 files changed, 1783 insertions, 108 deletions
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index b8f12e0897e6..89b05e2222c9 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -6,6 +6,7 @@ TARGETS += firmware TARGETS += ftrace TARGETS += futex TARGETS += kcmp +TARGETS += membarrier TARGETS += memfd TARGETS += memory-hotplug TARGETS += mount @@ -23,6 +24,7 @@ TARGETS += user TARGETS += jumplabel TARGETS += vm TARGETS += x86 +TARGETS += zram #Please keep the TARGETS list alphabetically sorted # Run "make quicktest=1 run_tests" or # "make quicktest=1 kselftest from top level Makefile @@ -72,7 +74,6 @@ ifdef INSTALL_PATH @# Ask all targets to install their files mkdir -p $(INSTALL_PATH) for TARGET in $(TARGETS); do \ - mkdir -p $(INSTALL_PATH)/$$TARGET ; \ make -C $$TARGET INSTALL_PATH=$(INSTALL_PATH)/$$TARGET install; \ done; diff --git a/tools/testing/selftests/breakpoints/Makefile b/tools/testing/selftests/breakpoints/Makefile index 182235640209..d27108b4f208 100644 --- a/tools/testing/selftests/breakpoints/Makefile +++ b/tools/testing/selftests/breakpoints/Makefile @@ -1,22 +1,12 @@ # Taken from perf makefile uname_M := $(shell uname -m 2>/dev/null || echo not) -ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/) -ifeq ($(ARCH),i386) - ARCH := x86 -endif -ifeq ($(ARCH),x86_64) - ARCH := x86 -endif +ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) - -all: ifeq ($(ARCH),x86) - gcc breakpoint_test.c -o breakpoint_test -else - echo "Not an x86 target, can't build breakpoints selftests" +TEST_PROGS := breakpoint_test endif -TEST_PROGS := breakpoint_test +all: include ../lib.mk diff --git a/tools/testing/selftests/capabilities/.gitignore b/tools/testing/selftests/capabilities/.gitignore new file mode 100644 index 000000000000..b732dd0d4738 --- /dev/null +++ b/tools/testing/selftests/capabilities/.gitignore @@ -0,0 +1,2 @@ +test_execve +validate_cap diff --git a/tools/testing/selftests/capabilities/Makefile b/tools/testing/selftests/capabilities/Makefile new file mode 100644 index 000000000000..8c8f0c1f0889 --- /dev/null +++ b/tools/testing/selftests/capabilities/Makefile @@ -0,0 +1,18 @@ +all: + +include ../lib.mk + +.PHONY: all clean + +TARGETS := validate_cap test_execve +TEST_PROGS := test_execve + +CFLAGS := -O2 -g -std=gnu99 -Wall -lcap-ng + +all: $(TARGETS) + +clean: + $(RM) $(TARGETS) + +$(TARGETS): %: %.c + $(CC) -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl diff --git a/tools/testing/selftests/capabilities/test_execve.c b/tools/testing/selftests/capabilities/test_execve.c new file mode 100644 index 000000000000..10a21a958aaf --- /dev/null +++ b/tools/testing/selftests/capabilities/test_execve.c @@ -0,0 +1,427 @@ +#define _GNU_SOURCE + +#include <cap-ng.h> +#include <err.h> +#include <linux/capability.h> +#include <stdbool.h> +#include <string.h> +#include <stdio.h> +#include <fcntl.h> +#include <errno.h> +#include <stdarg.h> +#include <sched.h> +#include <sys/mount.h> +#include <limits.h> +#include <libgen.h> +#include <malloc.h> +#include <sys/wait.h> +#include <sys/prctl.h> +#include <sys/stat.h> + +#ifndef PR_CAP_AMBIENT +#define PR_CAP_AMBIENT 47 +# define PR_CAP_AMBIENT_IS_SET 1 +# define PR_CAP_AMBIENT_RAISE 2 +# define PR_CAP_AMBIENT_LOWER 3 +# define PR_CAP_AMBIENT_CLEAR_ALL 4 +#endif + +static int nerrs; + +static void vmaybe_write_file(bool enoent_ok, char *filename, char *fmt, va_list ap) +{ + char buf[4096]; + int fd; + ssize_t written; + int buf_len; + + buf_len = vsnprintf(buf, sizeof(buf), fmt, ap); + if (buf_len < 0) { + err(1, "vsnprintf failed"); + } + if (buf_len >= sizeof(buf)) { + errx(1, "vsnprintf output truncated"); + } + + fd = open(filename, O_WRONLY); + if (fd < 0) { + if ((errno == ENOENT) && enoent_ok) + return; + err(1, "open of %s failed", filename); + } + written = write(fd, buf, buf_len); + if (written != buf_len) { + if (written >= 0) { + errx(1, "short write to %s", filename); + } else { + err(1, "write to %s failed", filename); + } + } + if (close(fd) != 0) { + err(1, "close of %s failed", filename); + } +} + +static void maybe_write_file(char *filename, char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vmaybe_write_file(true, filename, fmt, ap); + va_end(ap); +} + +static void write_file(char *filename, char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vmaybe_write_file(false, filename, fmt, ap); + va_end(ap); +} + +static bool create_and_enter_ns(uid_t inner_uid) +{ + uid_t outer_uid; + gid_t outer_gid; + int i; + bool have_outer_privilege; + + outer_uid = getuid(); + outer_gid = getgid(); + + /* + * TODO: If we're already root, we could skip creating the userns. + */ + + if (unshare(CLONE_NEWNS) == 0) { + printf("[NOTE]\tUsing global UIDs for tests\n"); + if (prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0) != 0) + err(1, "PR_SET_KEEPCAPS"); + if (setresuid(inner_uid, inner_uid, -1) != 0) + err(1, "setresuid"); + + // Re-enable effective caps + capng_get_caps_process(); + for (i = 0; i < CAP_LAST_CAP; i++) + if (capng_have_capability(CAPNG_PERMITTED, i)) + capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, i); + if (capng_apply(CAPNG_SELECT_CAPS) != 0) + err(1, "capng_apply"); + + have_outer_privilege = true; + } else if (unshare(CLONE_NEWUSER | CLONE_NEWNS) == 0) { + printf("[NOTE]\tUsing a user namespace for tests\n"); + maybe_write_file("/proc/self/setgroups", "deny"); + write_file("/proc/self/uid_map", "%d %d 1", inner_uid, outer_uid); + write_file("/proc/self/gid_map", "0 %d 1", outer_gid); + + have_outer_privilege = false; + } else { + errx(1, "must be root or be able to create a userns"); + } + + if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL) != 0) + err(1, "remount everything private"); + + return have_outer_privilege; +} + +static void chdir_to_tmpfs(void) +{ + char cwd[PATH_MAX]; + if (getcwd(cwd, sizeof(cwd)) != cwd) + err(1, "getcwd"); + + if (mount("private_tmp", ".", "tmpfs", 0, "mode=0777") != 0) + err(1, "mount private tmpfs"); + + if (chdir(cwd) != 0) + err(1, "chdir to private tmpfs"); + + if (umount2(".", MNT_DETACH) != 0) + err(1, "detach private tmpfs"); +} + +static void copy_fromat_to(int fromfd, const char *fromname, const char *toname) +{ + int from = openat(fromfd, fromname, O_RDONLY); + if (from == -1) + err(1, "open copy source"); + + int to = open(toname, O_CREAT | O_WRONLY | O_EXCL, 0700); + + while (true) { + char buf[4096]; + ssize_t sz = read(from, buf, sizeof(buf)); + if (sz == 0) + break; + if (sz < 0) + err(1, "read"); + + if (write(to, buf, sz) != sz) + err(1, "write"); /* no short writes on tmpfs */ + } + + close(from); + close(to); +} + +static bool fork_wait(void) +{ + pid_t child = fork(); + if (child == 0) { + nerrs = 0; + return true; + } else if (child > 0) { + int status; + if (waitpid(child, &status, 0) != child || + !WIFEXITED(status)) { + printf("[FAIL]\tChild died\n"); + nerrs++; + } else if (WEXITSTATUS(status) != 0) { + printf("[FAIL]\tChild failed\n"); + nerrs++; + } else { + printf("[OK]\tChild succeeded\n"); + } + + return false; + } else { + err(1, "fork"); + } +} + +static void exec_other_validate_cap(const char *name, + bool eff, bool perm, bool inh, bool ambient) +{ + execl(name, name, (eff ? "1" : "0"), + (perm ? "1" : "0"), (inh ? "1" : "0"), (ambient ? "1" : "0"), + NULL); + err(1, "execl"); +} + +static void exec_validate_cap(bool eff, bool perm, bool inh, bool ambient) +{ + exec_other_validate_cap("./validate_cap", eff, perm, inh, ambient); +} + +static int do_tests(int uid, const char *our_path) +{ + bool have_outer_privilege = create_and_enter_ns(uid); + + int ourpath_fd = open(our_path, O_RDONLY | O_DIRECTORY); + if (ourpath_fd == -1) + err(1, "open '%s'", our_path); + + chdir_to_tmpfs(); + + copy_fromat_to(ourpath_fd, "validate_cap", "validate_cap"); + + if (have_outer_privilege) { + uid_t gid = getegid(); + + copy_fromat_to(ourpath_fd, "validate_cap", + "validate_cap_suidroot"); + if (chown("validate_cap_suidroot", 0, -1) != 0) + err(1, "chown"); + if (chmod("validate_cap_suidroot", S_ISUID | 0700) != 0) + err(1, "chmod"); + + copy_fromat_to(ourpath_fd, "validate_cap", + "validate_cap_suidnonroot"); + if (chown("validate_cap_suidnonroot", uid + 1, -1) != 0) + err(1, "chown"); + if (chmod("validate_cap_suidnonroot", S_ISUID | 0700) != 0) + err(1, "chmod"); + + copy_fromat_to(ourpath_fd, "validate_cap", + "validate_cap_sgidroot"); + if (chown("validate_cap_sgidroot", -1, 0) != 0) + err(1, "chown"); + if (chmod("validate_cap_sgidroot", S_ISGID | 0710) != 0) + err(1, "chmod"); + + copy_fromat_to(ourpath_fd, "validate_cap", + "validate_cap_sgidnonroot"); + if (chown("validate_cap_sgidnonroot", -1, gid + 1) != 0) + err(1, "chown"); + if (chmod("validate_cap_sgidnonroot", S_ISGID | 0710) != 0) + err(1, "chmod"); +} + + capng_get_caps_process(); + + /* Make sure that i starts out clear */ + capng_update(CAPNG_DROP, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE); + if (capng_apply(CAPNG_SELECT_CAPS) != 0) + err(1, "capng_apply"); + + if (uid == 0) { + printf("[RUN]\tRoot => ep\n"); + if (fork_wait()) + exec_validate_cap(true, true, false, false); + } else { + printf("[RUN]\tNon-root => no caps\n"); + if (fork_wait()) + exec_validate_cap(false, false, false, false); + } + + printf("[OK]\tCheck cap_ambient manipulation rules\n"); + + /* We should not be able to add ambient caps yet. */ + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != -1 || errno != EPERM) { + if (errno == EINVAL) + printf("[FAIL]\tPR_CAP_AMBIENT_RAISE isn't supported\n"); + else + printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have failed eith EPERM on a non-inheritable cap\n"); + return 1; + } + printf("[OK]\tPR_CAP_AMBIENT_RAISE failed on non-inheritable cap\n"); + + capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_RAW); + capng_update(CAPNG_DROP, CAPNG_PERMITTED, CAP_NET_RAW); + capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, CAP_NET_RAW); + if (capng_apply(CAPNG_SELECT_CAPS) != 0) + err(1, "capng_apply"); + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_RAW, 0, 0, 0) != -1 || errno != EPERM) { + printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have failed on a non-permitted cap\n"); + return 1; + } + printf("[OK]\tPR_CAP_AMBIENT_RAISE failed on non-permitted cap\n"); + + capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE); + if (capng_apply(CAPNG_SELECT_CAPS) != 0) + err(1, "capng_apply"); + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) { + printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have succeeded\n"); + return 1; + } + printf("[OK]\tPR_CAP_AMBIENT_RAISE worked\n"); + + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 1) { + printf("[FAIL]\tPR_CAP_AMBIENT_IS_SET is broken\n"); + return 1; + } + + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0, 0) != 0) + err(1, "PR_CAP_AMBIENT_CLEAR_ALL"); + + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) { + printf("[FAIL]\tPR_CAP_AMBIENT_CLEAR_ALL didn't work\n"); + return 1; + } + + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) + err(1, "PR_CAP_AMBIENT_RAISE"); + + capng_update(CAPNG_DROP, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE); + if (capng_apply(CAPNG_SELECT_CAPS) != 0) + err(1, "capng_apply"); + + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) { + printf("[FAIL]\tDropping I should have dropped A\n"); + return 1; + } + + printf("[OK]\tBasic manipulation appears to work\n"); + + capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE); + if (capng_apply(CAPNG_SELECT_CAPS) != 0) + err(1, "capng_apply"); + if (uid == 0) { + printf("[RUN]\tRoot +i => eip\n"); + if (fork_wait()) + exec_validate_cap(true, true, true, false); + } else { + printf("[RUN]\tNon-root +i => i\n"); + if (fork_wait()) + exec_validate_cap(false, false, true, false); + } + + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) + err(1, "PR_CAP_AMBIENT_RAISE"); + + printf("[RUN]\tUID %d +ia => eipa\n", uid); + if (fork_wait()) + exec_validate_cap(true, true, true, true); + + /* The remaining tests need real privilege */ + + if (!have_outer_privilege) { + printf("[SKIP]\tSUID/SGID tests (needs privilege)\n"); + goto done; + } + + if (uid == 0) { + printf("[RUN]\tRoot +ia, suidroot => eipa\n"); + if (fork_wait()) + exec_other_validate_cap("./validate_cap_suidroot", + true, true, true, true); + + printf("[RUN]\tRoot +ia, suidnonroot => ip\n"); + if (fork_wait()) + exec_other_validate_cap("./validate_cap_suidnonroot", + false, true, true, false); + + printf("[RUN]\tRoot +ia, sgidroot => eipa\n"); + if (fork_wait()) + exec_other_validate_cap("./validate_cap_sgidroot", + true, true, true, true); + + if (fork_wait()) { + printf("[RUN]\tRoot, gid != 0, +ia, sgidroot => eip\n"); + if (setresgid(1, 1, 1) != 0) + err(1, "setresgid"); + exec_other_validate_cap("./validate_cap_sgidroot", + true, true, true, false); + } + + printf("[RUN]\tRoot +ia, sgidnonroot => eip\n"); + if (fork_wait()) + exec_other_validate_cap("./validate_cap_sgidnonroot", + true, true, true, false); + } else { + printf("[RUN]\tNon-root +ia, sgidnonroot => i\n"); + exec_other_validate_cap("./validate_cap_sgidnonroot", + false, false, true, false); + + if (fork_wait()) { + printf("[RUN]\tNon-root +ia, sgidroot => i\n"); + if (setresgid(1, 1, 1) != 0) + err(1, "setresgid"); + exec_other_validate_cap("./validate_cap_sgidroot", + false, false, true, false); + } + } + +done: + return nerrs ? 1 : 0; +} + +int main(int argc, char **argv) +{ + char *tmp1, *tmp2, *our_path; + + /* Find our path */ + tmp1 = strdup(argv[0]); + if (!tmp1) + err(1, "strdup"); + tmp2 = dirname(tmp1); + our_path = strdup(tmp2); + if (!our_path) + err(1, "strdup"); + free(tmp1); + + if (fork_wait()) { + printf("[RUN]\t+++ Tests with uid == 0 +++\n"); + return do_tests(0, our_path); + } + + if (fork_wait()) { + printf("[RUN]\t+++ Tests with uid != 0 +++\n"); + return do_tests(1, our_path); + } + + return nerrs ? 1 : 0; +} diff --git a/tools/testing/selftests/capabilities/validate_cap.c b/tools/testing/selftests/capabilities/validate_cap.c new file mode 100644 index 000000000000..dd3c45f7b23c --- /dev/null +++ b/tools/testing/selftests/capabilities/validate_cap.c @@ -0,0 +1,73 @@ +#include <cap-ng.h> +#include <err.h> +#include <linux/capability.h> +#include <stdbool.h> +#include <string.h> +#include <stdio.h> +#include <sys/prctl.h> +#include <sys/auxv.h> + +#ifndef PR_CAP_AMBIENT +#define PR_CAP_AMBIENT 47 +# define PR_CAP_AMBIENT_IS_SET 1 +# define PR_CAP_AMBIENT_RAISE 2 +# define PR_CAP_AMBIENT_LOWER 3 +# define PR_CAP_AMBIENT_CLEAR_ALL 4 +#endif + +#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 19) +# define HAVE_GETAUXVAL +#endif + +static bool bool_arg(char **argv, int i) +{ + if (!strcmp(argv[i], "0")) + return false; + else if (!strcmp(argv[i], "1")) + return true; + else + errx(1, "wrong argv[%d]", i); +} + +int main(int argc, char **argv) +{ + const char *atsec = ""; + + /* + * Be careful just in case a setgid or setcapped copy of this + * helper gets out. + */ + + if (argc != 5) + errx(1, "wrong argc"); + +#ifdef HAVE_GETAUXVAL + if (getauxval(AT_SECURE)) + atsec = " (AT_SECURE is set)"; + else + atsec = " (AT_SECURE is not set)"; +#endif + + capng_get_caps_process(); + + if (capng_have_capability(CAPNG_EFFECTIVE, CAP_NET_BIND_SERVICE) != bool_arg(argv, 1)) { + printf("[FAIL]\tWrong effective state%s\n", atsec); + return 1; + } + if (capng_have_capability(CAPNG_PERMITTED, CAP_NET_BIND_SERVICE) != bool_arg(argv, 2)) { + printf("[FAIL]\tWrong permitted state%s\n", atsec); + return 1; + } + if (capng_have_capability(CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE) != bool_arg(argv, 3)) { + printf("[FAIL]\tWrong inheritable state%s\n", atsec); + return 1; + } + + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != bool_arg(argv, 4)) { + printf("[FAIL]\tWrong ambient state%s\n", atsec); + return 1; + } + + printf("[OK]\tCapabilities after execve were correct\n"); + return 0; +} diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index ee412bab7ed4..97f1c6742066 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -12,11 +12,14 @@ run_tests: all $(RUN_TESTS) define INSTALL_RULE - mkdir -p $(INSTALL_PATH) - @for TEST_DIR in $(TEST_DIRS); do\ - cp -r $$TEST_DIR $(INSTALL_PATH); \ - done; - install -t $(INSTALL_PATH) $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) + @if [ "X$(TEST_PROGS)$(TEST_PROGS_EXTENDED)$(TEST_FILES)" != "X" ]; then \ + mkdir -p $(INSTALL_PATH); \ + for TEST_DIR in $(TEST_DIRS); do \ + cp -r $$TEST_DIR $(INSTALL_PATH); \ + done; \ + echo "install -t $(INSTALL_PATH) $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES)"; \ + install -t $(INSTALL_PATH) $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES); \ + fi endef install: all diff --git a/tools/testing/selftests/membarrier/.gitignore b/tools/testing/selftests/membarrier/.gitignore new file mode 100644 index 000000000000..020c44f49a9e --- /dev/null +++ b/tools/testing/selftests/membarrier/.gitignore @@ -0,0 +1 @@ +membarrier_test diff --git a/tools/testing/selftests/membarrier/Makefile b/tools/testing/selftests/membarrier/Makefile new file mode 100644 index 000000000000..877a50355d7f --- /dev/null +++ b/tools/testing/selftests/membarrier/Makefile @@ -0,0 +1,11 @@ +CFLAGS += -g -I../../../../usr/include/ + +all: + $(CC) $(CFLAGS) membarrier_test.c -o membarrier_test + +TEST_PROGS := membarrier_test + +include ../lib.mk + +clean: + $(RM) membarrier_test diff --git a/tools/testing/selftests/membarrier/membarrier_test.c b/tools/testing/selftests/membarrier/membarrier_test.c new file mode 100644 index 000000000000..dde312508007 --- /dev/null +++ b/tools/testing/selftests/membarrier/membarrier_test.c @@ -0,0 +1,121 @@ +#define _GNU_SOURCE +#define __EXPORTED_HEADERS__ + +#include <linux/membarrier.h> +#include <asm-generic/unistd.h> +#include <sys/syscall.h> +#include <stdio.h> +#include <errno.h> +#include <string.h> + +#include "../kselftest.h" + +enum test_membarrier_status { + TEST_MEMBARRIER_PASS = 0, + TEST_MEMBARRIER_FAIL, + TEST_MEMBARRIER_SKIP, +}; + +static int sys_membarrier(int cmd, int flags) +{ + return syscall(__NR_membarrier, cmd, flags); +} + +static enum test_membarrier_status test_membarrier_cmd_fail(void) +{ + int cmd = -1, flags = 0; + + if (sys_membarrier(cmd, flags) != -1) { + printf("membarrier: Wrong command should fail but passed.\n"); + return TEST_MEMBARRIER_FAIL; + } + return TEST_MEMBARRIER_PASS; +} + +static enum test_membarrier_status test_membarrier_flags_fail(void) +{ + int cmd = MEMBARRIER_CMD_QUERY, flags = 1; + + if (sys_membarrier(cmd, flags) != -1) { + printf("membarrier: Wrong flags should fail but passed.\n"); + return TEST_MEMBARRIER_FAIL; + } + return TEST_MEMBARRIER_PASS; +} + +static enum test_membarrier_status test_membarrier_success(void) +{ + int cmd = MEMBARRIER_CMD_SHARED, flags = 0; + + if (sys_membarrier(cmd, flags) != 0) { + printf("membarrier: Executing MEMBARRIER_CMD_SHARED failed. %s.\n", + strerror(errno)); + return TEST_MEMBARRIER_FAIL; + } + + printf("membarrier: MEMBARRIER_CMD_SHARED success.\n"); + return TEST_MEMBARRIER_PASS; +} + +static enum test_membarrier_status test_membarrier(void) +{ + enum test_membarrier_status status; + + status = test_membarrier_cmd_fail(); + if (status) + return status; + status = test_membarrier_flags_fail(); + if (status) + return status; + status = test_membarrier_success(); + if (status) + return status; + return TEST_MEMBARRIER_PASS; +} + +static enum test_membarrier_status test_membarrier_query(void) +{ + int flags = 0, ret; + + printf("membarrier MEMBARRIER_CMD_QUERY "); + ret = sys_membarrier(MEMBARRIER_CMD_QUERY, flags); + if (ret < 0) { + printf("failed. %s.\n", strerror(errno)); + switch (errno) { + case ENOSYS: + /* + * It is valid to build a kernel with + * CONFIG_MEMBARRIER=n. However, this skips the tests. + */ + return TEST_MEMBARRIER_SKIP; + case EINVAL: + default: + return TEST_MEMBARRIER_FAIL; + } + } + if (!(ret & MEMBARRIER_CMD_SHARED)) { + printf("command MEMBARRIER_CMD_SHARED is not supported.\n"); + return TEST_MEMBARRIER_FAIL; + } + printf("syscall available.\n"); + return TEST_MEMBARRIER_PASS; +} + +int main(int argc, char **argv) +{ + switch (test_membarrier_query()) { + case TEST_MEMBARRIER_FAIL: + return ksft_exit_fail(); + case TEST_MEMBARRIER_SKIP: + return ksft_exit_skip(); + } + switch (test_membarrier()) { + case TEST_MEMBARRIER_FAIL: + return ksft_exit_fail(); + case TEST_MEMBARRIER_SKIP: + return ksft_exit_skip(); + } + + printf("membarrier: tests done!\n"); + return ksft_exit_pass(); +} diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 231b9a031f6a..d36fab7d8ebd 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -4,14 +4,16 @@ CFLAGS = -Wall BINARIES = compaction_test BINARIES += hugepage-mmap BINARIES += hugepage-shm -BINARIES += hugetlbfstest BINARIES += map_hugetlb BINARIES += thuge-gen BINARIES += transhuge-stress +BINARIES += userfaultfd all: $(BINARIES) %: %.c $(CC) $(CFLAGS) -o $@ $^ -lrt +userfaultfd: userfaultfd.c + $(CC) $(CFLAGS) -O2 -o $@ $^ -lpthread TEST_PROGS := run_vmtests TEST_FILES := $(BINARIES) diff --git a/tools/testing/selftests/vm/hugetlbfstest.c b/tools/testing/selftests/vm/hugetlbfstest.c deleted file mode 100644 index 02e1072ec187..000000000000 --- a/tools/testing/selftests/vm/hugetlbfstest.c +++ /dev/null @@ -1,86 +0,0 @@ -#define _GNU_SOURCE -#include <assert.h> -#include <fcntl.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/mman.h> -#include <sys/stat.h> -#include <sys/types.h> -#include <unistd.h> - -typedef unsigned long long u64; - -static size_t length = 1 << 24; - -static u64 read_rss(void) -{ - char buf[4096], *s = buf; - int i, fd; - u64 rss; - - fd = open("/proc/self/statm", O_RDONLY); - assert(fd > 2); - memset(buf, 0, sizeof(buf)); - read(fd, buf, sizeof(buf) - 1); - for (i = 0; i < 1; i++) - s = strchr(s, ' ') + 1; - rss = strtoull(s, NULL, 10); - return rss << 12; /* assumes 4k pagesize */ -} - -static void do_mmap(int fd, int extra_flags, int unmap) -{ - int *p; - int flags = MAP_PRIVATE | MAP_POPULATE | extra_flags; - u64 before, after; - int ret; - - before = read_rss(); - p = mmap(NULL, length, PROT_READ | PROT_WRITE, flags, fd, 0); - assert(p != MAP_FAILED || - !"mmap returned an unexpected error"); - after = read_rss(); - assert(llabs(after - before - length) < 0x40000 || - !"rss didn't grow as expected"); - if (!unmap) - return; - ret = munmap(p, length); - assert(!ret || !"munmap returned an unexpected error"); - after = read_rss(); - assert(llabs(after - before) < 0x40000 || - !"rss didn't shrink as expected"); -} - -static int open_file(const char *path) -{ - int fd, err; - - unlink(path); - fd = open(path, O_CREAT | O_RDWR | O_TRUNC | O_EXCL - | O_LARGEFILE | O_CLOEXEC, 0600); - assert(fd > 2); - unlink(path); - err = ftruncate(fd, length); - assert(!err); - return fd; -} - -int main(void) -{ - int hugefd, fd; - - fd = open_file("/dev/shm/hugetlbhog"); - hugefd = open_file("/hugepages/hugetlbhog"); - - system("echo 100 > /proc/sys/vm/nr_hugepages"); - do_mmap(-1, MAP_ANONYMOUS, 1); - do_mmap(fd, 0, 1); - do_mmap(-1, MAP_ANONYMOUS | MAP_HUGETLB, 1); - do_mmap(hugefd, 0, 1); - do_mmap(hugefd, MAP_HUGETLB, 1); - /* Leak the last one to test do_exit() */ - do_mmap(-1, MAP_ANONYMOUS | MAP_HUGETLB, 0); - printf("oll korrekt.\n"); - return 0; -} diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index 49ece11ff7fd..9179ce8df485 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -75,10 +75,14 @@ else echo "[PASS]" fi +echo "NOTE: The above hugetlb tests provide minimal coverage. Use" +echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" +echo " hugetlb regression testing." + echo "--------------------" -echo "running hugetlbfstest" +echo "running userfaultfd" echo "--------------------" -./hugetlbfstest +./userfaultfd 128 32 if [ $? -ne 0 ]; then echo "[FAIL]" exitcode=1 diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c new file mode 100644 index 000000000000..2c7cca6f26a4 --- /dev/null +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -0,0 +1,639 @@ +/* + * Stress userfaultfd syscall. + * + * Copyright (C) 2015 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * This test allocates two virtual areas and bounces the physical + * memory across the two virtual areas (from area_src to area_dst) + * using userfaultfd. + * + * There are three threads running per CPU: + * + * 1) one per-CPU thread takes a per-page pthread_mutex in a random + * page of the area_dst (while the physical page may still be in + * area_src), and increments a per-page counter in the same page, + * and checks its value against a verification region. + * + * 2) another per-CPU thread handles the userfaults generated by + * thread 1 above. userfaultfd blocking reads or poll() modes are + * exercised interleaved. + * + * 3) one last per-CPU thread transfers the memory in the background + * at maximum bandwidth (if not already transferred by thread + * 2). Each cpu thread takes cares of transferring a portion of the + * area. + * + * When all threads of type 3 completed the transfer, one bounce is + * complete. area_src and area_dst are then swapped. All threads are + * respawned and so the bounce is immediately restarted in the + * opposite direction. + * + * per-CPU threads 1 by triggering userfaults inside + * pthread_mutex_lock will also verify the atomicity of the memory + * transfer (UFFDIO_COPY). + * + * The program takes two parameters: the amounts of physical memory in + * megabytes (MiB) of the area and the number of bounces to execute. + * + * # 100MiB 99999 bounces + * ./userfaultfd 100 99999 + * + * # 1GiB 99 bounces + * ./userfaultfd 1000 99 + * + * # 10MiB-~6GiB 999 bounces, continue forever unless an error triggers + * while ./userfaultfd $[RANDOM % 6000 + 10] 999; do true; done + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <errno.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <time.h> +#include <signal.h> +#include <poll.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/syscall.h> +#include <sys/ioctl.h> +#include <pthread.h> +#include "../../../../include/uapi/linux/userfaultfd.h" + +#ifdef __x86_64__ +#define __NR_userfaultfd 323 +#elif defined(__i386__) +#define __NR_userfaultfd 374 +#elif defined(__powewrpc__) +#define __NR_userfaultfd 364 +#else +#error "missing __NR_userfaultfd definition" +#endif + +static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; + +#define BOUNCE_RANDOM (1<<0) +#define BOUNCE_RACINGFAULTS (1<<1) +#define BOUNCE_VERIFY (1<<2) +#define BOUNCE_POLL (1<<3) +static int bounces; + +static unsigned long long *count_verify; +static int uffd, finished, *pipefd; +static char *area_src, *area_dst; +static char *zeropage; +pthread_attr_t attr; + +/* pthread_mutex_t starts at page offset 0 */ +#define area_mutex(___area, ___nr) \ + ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) +/* + * count is placed in the page after pthread_mutex_t naturally aligned + * to avoid non alignment faults on non-x86 archs. + */ +#define area_count(___area, ___nr) \ + ((volatile unsigned long long *) ((unsigned long) \ + ((___area) + (___nr)*page_size + \ + sizeof(pthread_mutex_t) + \ + sizeof(unsigned long long) - 1) & \ + ~(unsigned long)(sizeof(unsigned long long) \ + - 1))) + +static int my_bcmp(char *str1, char *str2, size_t n) +{ + unsigned long i; + for (i = 0; i < n; i++) + if (str1[i] != str2[i]) + return 1; + return 0; +} + +static void *locking_thread(void *arg) +{ + unsigned long cpu = (unsigned long) arg; + struct random_data rand; + unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */ + int32_t rand_nr; + unsigned long long count; + char randstate[64]; + unsigned int seed; + time_t start; + + if (bounces & BOUNCE_RANDOM) { + seed = (unsigned int) time(NULL) - bounces; + if (!(bounces & BOUNCE_RACINGFAULTS)) + seed += cpu; + bzero(&rand, sizeof(rand)); + bzero(&randstate, sizeof(randstate)); + if (initstate_r(seed, randstate, sizeof(randstate), &rand)) + fprintf(stderr, "srandom_r error\n"), exit(1); + } else { + page_nr = -bounces; + if (!(bounces & BOUNCE_RACINGFAULTS)) + page_nr += cpu * nr_pages_per_cpu; + } + + while (!finished) { + if (bounces & BOUNCE_RANDOM) { + if (random_r(&rand, &rand_nr)) + fprintf(stderr, "random_r 1 error\n"), exit(1); + page_nr = rand_nr; + if (sizeof(page_nr) > sizeof(rand_nr)) { + if (random_r(&rand, &rand_nr)) + fprintf(stderr, "random_r 2 error\n"), exit(1); + page_nr |= (((unsigned long) rand_nr) << 16) << + 16; + } + } else + page_nr += 1; + page_nr %= nr_pages; + + start = time(NULL); + if (bounces & BOUNCE_VERIFY) { + count = *area_count(area_dst, page_nr); + if (!count) + fprintf(stderr, + "page_nr %lu wrong count %Lu %Lu\n", + page_nr, count, + count_verify[page_nr]), exit(1); + + + /* + * We can't use bcmp (or memcmp) because that + * returns 0 erroneously if the memory is + * changing under it (even if the end of the + * page is never changing and always + * different). + */ +#if 1 + if (!my_bcmp(area_dst + page_nr * page_size, zeropage, + page_size)) + fprintf(stderr, + "my_bcmp page_nr %lu wrong count %Lu %Lu\n", + page_nr, count, + count_verify[page_nr]), exit(1); +#else + unsigned long loops; + + loops = 0; + /* uncomment the below line to test with mutex */ + /* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */ + while (!bcmp(area_dst + page_nr * page_size, zeropage, + page_size)) { + loops += 1; + if (loops > 10) + break; + } + /* uncomment below line to test with mutex */ + /* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */ + if (loops) { + fprintf(stderr, + "page_nr %lu all zero thread %lu %p %lu\n", + page_nr, cpu, area_dst + page_nr * page_size, + loops); + if (loops > 10) + exit(1); + } +#endif + } + + pthread_mutex_lock(area_mutex(area_dst, page_nr)); + count = *area_count(area_dst, page_nr); + if (count != count_verify[page_nr]) { + fprintf(stderr, + "page_nr %lu memory corruption %Lu %Lu\n", + page_nr, count, + count_verify[page_nr]), exit(1); + } + count++; + *area_count(area_dst, page_nr) = count_verify[page_nr] = count; + pthread_mutex_unlock(area_mutex(area_dst, page_nr)); + + if (time(NULL) - start > 1) + fprintf(stderr, + "userfault too slow %ld " + "possible false positive with overcommit\n", + time(NULL) - start); + } + + return NULL; +} + +static int copy_page(unsigned long offset) +{ + struct uffdio_copy uffdio_copy; + + if (offset >= nr_pages * page_size) + fprintf(stderr, "unexpected offset %lu\n", + offset), exit(1); + uffdio_copy.dst = (unsigned long) area_dst + offset; + uffdio_copy.src = (unsigned long) area_src + offset; + uffdio_copy.len = page_size; + uffdio_copy.mode = 0; + uffdio_copy.copy = 0; + if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy)) { + /* real retval in ufdio_copy.copy */ + if (uffdio_copy.copy != -EEXIST) + fprintf(stderr, "UFFDIO_COPY error %Ld\n", + uffdio_copy.copy), exit(1); + } else if (uffdio_copy.copy != page_size) { + fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n", + uffdio_copy.copy), exit(1); + } else + return 1; + return 0; +} + +static void *uffd_poll_thread(void *arg) +{ + unsigned long cpu = (unsigned long) arg; + struct pollfd pollfd[2]; + struct uffd_msg msg; + int ret; + unsigned long offset; + char tmp_chr; + unsigned long userfaults = 0; + + pollfd[0].fd = uffd; + pollfd[0].events = POLLIN; + pollfd[1].fd = pipefd[cpu*2]; + pollfd[1].events = POLLIN; + + for (;;) { + ret = poll(pollfd, 2, -1); + if (!ret) + fprintf(stderr, "poll error %d\n", ret), exit(1); + if (ret < 0) + perror("poll"), exit(1); + if (pollfd[1].revents & POLLIN) { + if (read(pollfd[1].fd, &tmp_chr, 1) != 1) + fprintf(stderr, "read pipefd error\n"), + exit(1); + break; + } + if (!(pollfd[0].revents & POLLIN)) + fprintf(stderr, "pollfd[0].revents %d\n", + pollfd[0].revents), exit(1); + ret = read(uffd, &msg, sizeof(msg)); + if (ret < 0) { + if (errno == EAGAIN) + continue; + perror("nonblocking read error"), exit(1); + } + if (msg.event != UFFD_EVENT_PAGEFAULT) + fprintf(stderr, "unexpected msg event %u\n", + msg.event), exit(1); + if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) + fprintf(stderr, "unexpected write fault\n"), exit(1); + offset = (char *)(unsigned long)msg.arg.pagefault.address - + area_dst; + offset &= ~(page_size-1); + if (copy_page(offset)) + userfaults++; + } + return (void *)userfaults; +} + +pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; + +static void *uffd_read_thread(void *arg) +{ + unsigned long *this_cpu_userfaults; + struct uffd_msg msg; + unsigned long offset; + int ret; + + this_cpu_userfaults = (unsigned long *) arg; + *this_cpu_userfaults = 0; + + pthread_mutex_unlock(&uffd_read_mutex); + /* from here cancellation is ok */ + + for (;;) { + ret = read(uffd, &msg, sizeof(msg)); + if (ret != sizeof(msg)) { + if (ret < 0) + perror("blocking read error"), exit(1); + else + fprintf(stderr, "short read\n"), exit(1); + } + if (msg.event != UFFD_EVENT_PAGEFAULT) + fprintf(stderr, "unexpected msg event %u\n", + msg.event), exit(1); + if (bounces & BOUNCE_VERIFY && + msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) + fprintf(stderr, "unexpected write fault\n"), exit(1); + offset = (char *)(unsigned long)msg.arg.pagefault.address - + area_dst; + offset &= ~(page_size-1); + if (copy_page(offset)) + (*this_cpu_userfaults)++; + } + return (void *)NULL; +} + +static void *background_thread(void *arg) +{ + unsigned long cpu = (unsigned long) arg; + unsigned long page_nr; + + for (page_nr = cpu * nr_pages_per_cpu; + page_nr < (cpu+1) * nr_pages_per_cpu; + page_nr++) + copy_page(page_nr * page_size); + + return NULL; +} + +static int stress(unsigned long *userfaults) +{ + unsigned long cpu; + pthread_t locking_threads[nr_cpus]; + pthread_t uffd_threads[nr_cpus]; + pthread_t background_threads[nr_cpus]; + void **_userfaults = (void **) userfaults; + + finished = 0; + for (cpu = 0; cpu < nr_cpus; cpu++) { + if (pthread_create(&locking_threads[cpu], &attr, + locking_thread, (void *)cpu)) + return 1; + if (bounces & BOUNCE_POLL) { + if (pthread_create(&uffd_threads[cpu], &attr, + uffd_poll_thread, (void *)cpu)) + return 1; + } else { + if (pthread_create(&uffd_threads[cpu], &attr, + uffd_read_thread, + &_userfaults[cpu])) + return 1; + pthread_mutex_lock(&uffd_read_mutex); + } + if (pthread_create(&background_threads[cpu], &attr, + background_thread, (void *)cpu)) + return 1; + } + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pthread_join(background_threads[cpu], NULL)) + return 1; + + /* + * Be strict and immediately zap area_src, the whole area has + * been transferred already by the background treads. The + * area_src could then be faulted in in a racy way by still + * running uffdio_threads reading zeropages after we zapped + * area_src (but they're guaranteed to get -EEXIST from + * UFFDIO_COPY without writing zero pages into area_dst + * because the background threads already completed). + */ + if (madvise(area_src, nr_pages * page_size, MADV_DONTNEED)) { + perror("madvise"); + return 1; + } + + for (cpu = 0; cpu < nr_cpus; cpu++) { + char c; + if (bounces & BOUNCE_POLL) { + if (write(pipefd[cpu*2+1], &c, 1) != 1) { + fprintf(stderr, "pipefd write error\n"); + return 1; + } + if (pthread_join(uffd_threads[cpu], &_userfaults[cpu])) + return 1; + } else { + if (pthread_cancel(uffd_threads[cpu])) + return 1; + if (pthread_join(uffd_threads[cpu], NULL)) + return 1; + } + } + + finished = 1; + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pthread_join(locking_threads[cpu], NULL)) + return 1; + + return 0; +} + +static int userfaultfd_stress(void) +{ + void *area; + char *tmp_area; + unsigned long nr; + struct uffdio_register uffdio_register; + struct uffdio_api uffdio_api; + unsigned long cpu; + int uffd_flags; + unsigned long userfaults[nr_cpus]; + + if (posix_memalign(&area, page_size, nr_pages * page_size)) { + fprintf(stderr, "out of memory\n"); + return 1; + } + area_src = area; + if (posix_memalign(&area, page_size, nr_pages * page_size)) { + fprintf(stderr, "out of memory\n"); + return 1; + } + area_dst = area; + + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (uffd < 0) { + fprintf(stderr, + "userfaultfd syscall not available in this kernel\n"); + return 1; + } + uffd_flags = fcntl(uffd, F_GETFD, NULL); + + uffdio_api.api = UFFD_API; + uffdio_api.features = 0; + if (ioctl(uffd, UFFDIO_API, &uffdio_api)) { + fprintf(stderr, "UFFDIO_API\n"); + return 1; + } + if (uffdio_api.api != UFFD_API) { + fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api); + return 1; + } + + count_verify = malloc(nr_pages * sizeof(unsigned long long)); + if (!count_verify) { + perror("count_verify"); + return 1; + } + + for (nr = 0; nr < nr_pages; nr++) { + *area_mutex(area_src, nr) = (pthread_mutex_t) + PTHREAD_MUTEX_INITIALIZER; + count_verify[nr] = *area_count(area_src, nr) = 1; + } + + pipefd = malloc(sizeof(int) * nr_cpus * 2); + if (!pipefd) { + perror("pipefd"); + return 1; + } + for (cpu = 0; cpu < nr_cpus; cpu++) { + if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) { + perror("pipe"); + return 1; + } + } + + if (posix_memalign(&area, page_size, page_size)) { + fprintf(stderr, "out of memory\n"); + return 1; + } + zeropage = area; + bzero(zeropage, page_size); + + pthread_mutex_lock(&uffd_read_mutex); + + pthread_attr_init(&attr); + pthread_attr_setstacksize(&attr, 16*1024*1024); + + while (bounces--) { + unsigned long expected_ioctls; + + printf("bounces: %d, mode:", bounces); + if (bounces & BOUNCE_RANDOM) + printf(" rnd"); + if (bounces & BOUNCE_RACINGFAULTS) + printf(" racing"); + if (bounces & BOUNCE_VERIFY) + printf(" ver"); + if (bounces & BOUNCE_POLL) + printf(" poll"); + printf(", "); + fflush(stdout); + + if (bounces & BOUNCE_POLL) + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + else + fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK); + + /* register */ + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) { + fprintf(stderr, "register failure\n"); + return 1; + } + expected_ioctls = (1 << _UFFDIO_WAKE) | + (1 << _UFFDIO_COPY) | + (1 << _UFFDIO_ZEROPAGE); + if ((uffdio_register.ioctls & expected_ioctls) != + expected_ioctls) { + fprintf(stderr, + "unexpected missing ioctl for anon memory\n"); + return 1; + } + + /* + * The madvise done previously isn't enough: some + * uffd_thread could have read userfaults (one of + * those already resolved by the background thread) + * and it may be in the process of calling + * UFFDIO_COPY. UFFDIO_COPY will read the zapped + * area_src and it would map a zero page in it (of + * course such a UFFDIO_COPY is perfectly safe as it'd + * return -EEXIST). The problem comes at the next + * bounce though: that racing UFFDIO_COPY would + * generate zeropages in the area_src, so invalidating + * the previous MADV_DONTNEED. Without this additional + * MADV_DONTNEED those zeropages leftovers in the + * area_src would lead to -EEXIST failure during the + * next bounce, effectively leaving a zeropage in the + * area_dst. + * + * Try to comment this out madvise to see the memory + * corruption being caught pretty quick. + * + * khugepaged is also inhibited to collapse THP after + * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's + * required to MADV_DONTNEED here. + */ + if (madvise(area_dst, nr_pages * page_size, MADV_DONTNEED)) { + perror("madvise 2"); + return 1; + } + + /* bounce pass */ + if (stress(userfaults)) + return 1; + + /* unregister */ + if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) { + fprintf(stderr, "register failure\n"); + return 1; + } + + /* verification */ + if (bounces & BOUNCE_VERIFY) { + for (nr = 0; nr < nr_pages; nr++) { + if (my_bcmp(area_dst, + area_dst + nr * page_size, + sizeof(pthread_mutex_t))) { + fprintf(stderr, + "error mutex 2 %lu\n", + nr); + bounces = 0; + } + if (*area_count(area_dst, nr) != count_verify[nr]) { + fprintf(stderr, + "error area_count %Lu %Lu %lu\n", + *area_count(area_src, nr), + count_verify[nr], + nr); + bounces = 0; + } + } + } + + /* prepare next bounce */ + tmp_area = area_src; + area_src = area_dst; + area_dst = tmp_area; + + printf("userfaults:"); + for (cpu = 0; cpu < nr_cpus; cpu++) + printf(" %lu", userfaults[cpu]); + printf("\n"); + } + + return 0; +} + +int main(int argc, char **argv) +{ + if (argc < 3) + fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); + nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + page_size = sysconf(_SC_PAGE_SIZE); + if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) > + page_size) + fprintf(stderr, "Impossible to run this test\n"), exit(2); + nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size / + nr_cpus; + if (!nr_pages_per_cpu) { + fprintf(stderr, "invalid MiB\n"); + fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); + } + bounces = atoi(argv[2]); + if (bounces <= 0) { + fprintf(stderr, "invalid bounces\n"); + fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); + } + nr_pages = nr_pages_per_cpu * nr_cpus; + printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", + nr_pages, nr_pages_per_cpu); + return userfaultfd_stress(); +} diff --git a/tools/testing/selftests/zram/Makefile b/tools/testing/selftests/zram/Makefile new file mode 100644 index 000000000000..29d80346e3eb --- /dev/null +++ b/tools/testing/selftests/zram/Makefile @@ -0,0 +1,9 @@ +all: + +TEST_PROGS := zram.sh +TEST_FILES := zram01.sh zram02.sh zram_lib.sh + +include ../lib.mk + +clean: + $(RM) err.log diff --git a/tools/testing/selftests/zram/README b/tools/testing/selftests/zram/README new file mode 100644 index 000000000000..eb17917c8a3a --- /dev/null +++ b/tools/testing/selftests/zram/README @@ -0,0 +1,40 @@ +zram: Compressed RAM based block devices +---------------------------------------- +* Introduction + +The zram module creates RAM based block devices named /dev/zram<id> +(<id> = 0, 1, ...). Pages written to these disks are compressed and stored +in memory itself. These disks allow very fast I/O and compression provides +good amounts of memory savings. Some of the usecases include /tmp storage, +use as swap disks, various caches under /var and maybe many more :) + +Statistics for individual zram devices are exported through sysfs nodes at +/sys/block/zram<id>/ + +Kconfig required: +CONFIG_ZRAM=y +CONFIG_ZRAM_LZ4_COMPRESS=y +CONFIG_ZPOOL=y +CONFIG_ZSMALLOC=y + +ZRAM Testcases +-------------- +zram_lib.sh: create library with initialization/cleanup functions +zram.sh: For sanity check of CONFIG_ZRAM and to run zram01 and zram02 + +Two functional tests: zram01 and zram02: +zram01.sh: creates general purpose ram disks with ext4 filesystems +zram02.sh: creates block device for swap + +Commands required for testing: + - bc + - dd + - free + - awk + - mkswap + - swapon + - swapoff + - mkfs/ mkfs.ext4 + +For more information please refer: +kernel-source-tree/Documentation/blockdev/zram.txt diff --git a/tools/testing/selftests/zram/zram.sh b/tools/testing/selftests/zram/zram.sh new file mode 100755 index 000000000000..20de9a761269 --- /dev/null +++ b/tools/testing/selftests/zram/zram.sh @@ -0,0 +1,35 @@ +#!/bin/bash +TCID="zram.sh" + +check_prereqs() +{ + local msg="skip all tests:" + + if [ $UID != 0 ]; then + echo $msg must be run as root >&2 + exit 0 + fi +} + +run_zram () { +echo "--------------------" +echo "running zram tests" +echo "--------------------" +./zram01.sh +echo "" +./zram02.sh +} + +check_prereqs + +# check zram module exists +MODULE_PATH=/lib/modules/`uname -r`/kernel/drivers/block/zram/zram.ko +if [ -f $MODULE_PATH ]; then + run_zram +elif [ -b /dev/zram0 ]; then + run_zram +else + echo "$TCID : No zram.ko module or /dev/zram0 device file not found" + echo "$TCID : CONFIG_ZRAM is not set" + exit 1 +fi diff --git a/tools/testing/selftests/zram/zram01.sh b/tools/testing/selftests/zram/zram01.sh new file mode 100755 index 000000000000..b9566a6478a9 --- /dev/null +++ b/tools/testing/selftests/zram/zram01.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# Copyright (c) 2015 Oracle and/or its affiliates. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of +# the License, or (at your option) any later version. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# Test creates several zram devices with different filesystems on them. +# It fills each device with zeros and checks that compression works. +# +# Author: Alexey Kodanev <alexey.kodanev@oracle.com> +# Modified: Naresh Kamboju <naresh.kamboju@linaro.org> + +TCID="zram01" +ERR_CODE=0 + +. ./zram_lib.sh + +# Test will create the following number of zram devices: +dev_num=1 +# This is a list of parameters for zram devices. +# Number of items must be equal to 'dev_num' parameter. +zram_max_streams="2" + +# The zram sysfs node 'disksize' value can be either in bytes, +# or you can use mem suffixes. But in some old kernels, mem +# suffixes are not supported, for example, in RHEL6.6GA's kernel +# layer, it uses strict_strtoull() to parse disksize which does +# not support mem suffixes, in some newer kernels, they use +# memparse() which supports mem suffixes. So here we just use +# bytes to make sure everything works correctly. +zram_sizes="2097152" # 2MB +zram_mem_limits="2M" +zram_filesystems="ext4" +zram_algs="lzo" + +zram_fill_fs() +{ + local mem_free0=$(free -m | awk 'NR==2 {print $4}') + + for i in $(seq 0 $(($dev_num - 1))); do + echo "fill zram$i..." + local b=0 + while [ true ]; do + dd conv=notrunc if=/dev/zero of=zram${i}/file \ + oflag=append count=1 bs=1024 status=none \ + > /dev/null 2>&1 || break + b=$(($b + 1)) + done + echo "zram$i can be filled with '$b' KB" + done + + local mem_free1=$(free -m | awk 'NR==2 {print $4}') + local used_mem=$(($mem_free0 - $mem_free1)) + + local total_size=0 + for sm in $zram_sizes; do + local s=$(echo $sm | sed 's/M//') + total_size=$(($total_size + $s)) + done + + echo "zram used ${used_mem}M, zram disk sizes ${total_size}M" + + local v=$((100 * $total_size / $used_mem)) + + if [ "$v" -lt 100 ]; then + echo "FAIL compression ratio: 0.$v:1" + ERR_CODE=-1 + zram_cleanup + return + fi + + echo "zram compression ratio: $(echo "scale=2; $v / 100 " | bc):1: OK" +} + +check_prereqs +zram_load +zram_max_streams +zram_compress_alg +zram_set_disksizes +zram_set_memlimit +zram_makefs +zram_mount + +zram_fill_fs +zram_cleanup +zram_unload + +if [ $ERR_CODE -ne 0 ]; then + echo "$TCID : [FAIL]" +else + echo "$TCID : [PASS]" +fi diff --git a/tools/testing/selftests/zram/zram02.sh b/tools/testing/selftests/zram/zram02.sh new file mode 100755 index 000000000000..74569b883737 --- /dev/null +++ b/tools/testing/selftests/zram/zram02.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# Copyright (c) 2015 Oracle and/or its affiliates. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of +# the License, or (at your option) any later version. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# Test checks that we can create swap zram device. +# +# Author: Alexey Kodanev <alexey.kodanev@oracle.com> +# Modified: Naresh Kamboju <naresh.kamboju@linaro.org> + +TCID="zram02" +ERR_CODE=0 + +. ./zram_lib.sh + +# Test will create the following number of zram devices: +dev_num=1 +# This is a list of parameters for zram devices. +# Number of items must be equal to 'dev_num' parameter. +zram_max_streams="2" + +# The zram sysfs node 'disksize' value can be either in bytes, +# or you can use mem suffixes. But in some old kernels, mem +# suffixes are not supported, for example, in RHEL6.6GA's kernel +# layer, it uses strict_strtoull() to parse disksize which does +# not support mem suffixes, in some newer kernels, they use +# memparse() which supports mem suffixes. So here we just use +# bytes to make sure everything works correctly. +zram_sizes="1048576" # 1M +zram_mem_limits="1M" + +check_prereqs +zram_load +zram_max_streams +zram_set_disksizes +zram_set_memlimit +zram_makeswap +zram_swapoff +zram_cleanup +zram_unload + +if [ $ERR_CODE -ne 0 ]; then + echo "$TCID : [FAIL]" +else + echo "$TCID : [PASS]" +fi diff --git a/tools/testing/selftests/zram/zram_lib.sh b/tools/testing/selftests/zram/zram_lib.sh new file mode 100755 index 000000000000..424e68ed1487 --- /dev/null +++ b/tools/testing/selftests/zram/zram_lib.sh @@ -0,0 +1,232 @@ +#!/bin/sh +# Copyright (c) 2015 Oracle and/or its affiliates. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of +# the License, or (at your option) any later version. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# Author: Alexey Kodanev <alexey.kodanev@oracle.com> +# Modified: Naresh Kamboju <naresh.kamboju@linaro.org> + +MODULE=0 +dev_makeswap=-1 +dev_mounted=-1 + +trap INT + +check_prereqs() +{ + local msg="skip all tests:" + + if [ $UID != 0 ]; then + echo $msg must be run as root >&2 + exit 0 + fi +} + +zram_cleanup() +{ + echo "zram cleanup" + local i= + for i in $(seq 0 $dev_makeswap); do + swapoff /dev/zram$i + done + + for i in $(seq 0 $dev_mounted); do + umount /dev/zram$i + done + + for i in $(seq 0 $(($dev_num - 1))); do + echo 1 > /sys/block/zram${i}/reset + rm -rf zram$i + done + +} + +zram_unload() +{ + if [ $MODULE -ne 0 ] ; then + echo "zram rmmod zram" + rmmod zram > /dev/null 2>&1 + fi +} + +zram_load() +{ + # check zram module exists + MODULE_PATH=/lib/modules/`uname -r`/kernel/drivers/block/zram/zram.ko + if [ -f $MODULE_PATH ]; then + MODULE=1 + echo "create '$dev_num' zram device(s)" + modprobe zram num_devices=$dev_num + if [ $? -ne 0 ]; then + echo "failed to insert zram module" + exit 1 + fi + + dev_num_created=$(ls /dev/zram* | wc -w) + + if [ "$dev_num_created" -ne "$dev_num" ]; then + echo "unexpected num of devices: $dev_num_created" + ERR_CODE=-1 + else + echo "zram load module successful" + fi + elif [ -b /dev/zram0 ]; then + echo "/dev/zram0 device file found: OK" + else + echo "ERROR: No zram.ko module or no /dev/zram0 device found" + echo "$TCID : CONFIG_ZRAM is not set" + exit 1 + fi +} + +zram_max_streams() +{ + echo "set max_comp_streams to zram device(s)" + + local i=0 + for max_s in $zram_max_streams; do + local sys_path="/sys/block/zram${i}/max_comp_streams" + echo $max_s > $sys_path || \ + echo "FAIL failed to set '$max_s' to $sys_path" + sleep 1 + local max_streams=$(cat $sys_path) + + [ "$max_s" -ne "$max_streams" ] && \ + echo "FAIL can't set max_streams '$max_s', get $max_stream" + + i=$(($i + 1)) + echo "$sys_path = '$max_streams' ($i/$dev_num)" + done + + echo "zram max streams: OK" +} + +zram_compress_alg() +{ + echo "test that we can set compression algorithm" + + local algs=$(cat /sys/block/zram0/comp_algorithm) + echo "supported algs: $algs" + local i=0 + for alg in $zram_algs; do + local sys_path="/sys/block/zram${i}/comp_algorithm" + echo "$alg" > $sys_path || \ + echo "FAIL can't set '$alg' to $sys_path" + i=$(($i + 1)) + echo "$sys_path = '$alg' ($i/$dev_num)" + done + + echo "zram set compression algorithm: OK" +} + +zram_set_disksizes() +{ + echo "set disk size to zram device(s)" + local i=0 + for ds in $zram_sizes; do + local sys_path="/sys/block/zram${i}/disksize" + echo "$ds" > $sys_path || \ + echo "FAIL can't set '$ds' to $sys_path" + + i=$(($i + 1)) + echo "$sys_path = '$ds' ($i/$dev_num)" + done + + echo "zram set disksizes: OK" +} + +zram_set_memlimit() +{ + echo "set memory limit to zram device(s)" + + local i=0 + for ds in $zram_mem_limits; do + local sys_path="/sys/block/zram${i}/mem_limit" + echo "$ds" > $sys_path || \ + echo "FAIL can't set '$ds' to $sys_path" + + i=$(($i + 1)) + echo "$sys_path = '$ds' ($i/$dev_num)" + done + + echo "zram set memory limit: OK" +} + +zram_makeswap() +{ + echo "make swap with zram device(s)" + local i=0 + for i in $(seq 0 $(($dev_num - 1))); do + mkswap /dev/zram$i > err.log 2>&1 + if [ $? -ne 0 ]; then + cat err.log + echo "FAIL mkswap /dev/zram$1 failed" + fi + + swapon /dev/zram$i > err.log 2>&1 + if [ $? -ne 0 ]; then + cat err.log + echo "FAIL swapon /dev/zram$1 failed" + fi + + echo "done with /dev/zram$i" + dev_makeswap=$i + done + + echo "zram making zram mkswap and swapon: OK" +} + +zram_swapoff() +{ + local i= + for i in $(seq 0 $dev_makeswap); do + swapoff /dev/zram$i > err.log 2>&1 + if [ $? -ne 0 ]; then + cat err.log + echo "FAIL swapoff /dev/zram$i failed" + fi + done + dev_makeswap=-1 + + echo "zram swapoff: OK" +} + +zram_makefs() +{ + local i=0 + for fs in $zram_filesystems; do + # if requested fs not supported default it to ext2 + which mkfs.$fs > /dev/null 2>&1 || fs=ext2 + + echo "make $fs filesystem on /dev/zram$i" + mkfs.$fs /dev/zram$i > err.log 2>&1 + if [ $? -ne 0 ]; then + cat err.log + echo "FAIL failed to make $fs on /dev/zram$i" + fi + i=$(($i + 1)) + echo "zram mkfs.$fs: OK" + done +} + +zram_mount() +{ + local i=0 + for i in $(seq 0 $(($dev_num - 1))); do + echo "mount /dev/zram$i" + mkdir zram$i + mount /dev/zram$i zram$i > /dev/null || \ + echo "FAIL mount /dev/zram$i failed" + dev_mounted=$i + done + + echo "zram mount of zram device(s): OK" +} |