diff options
Diffstat (limited to 'tools')
40 files changed, 3228 insertions, 167 deletions
diff --git a/tools/Makefile b/tools/Makefile index 6aaeb6cd867d..41067f304215 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -12,6 +12,7 @@ help: @echo ' turbostat - Intel CPU idle stats and freq reporting tool' @echo ' usb - USB testing tools' @echo ' virtio - vhost test module' + @echo ' net - misc networking tools' @echo ' vm - misc vm tools' @echo ' x86_energy_perf_policy - Intel energy policy tool' @echo '' @@ -34,7 +35,7 @@ help: cpupower: FORCE $(call descend,power/$@) -cgroup firewire guest usb virtio vm: FORCE +cgroup firewire guest usb virtio vm net: FORCE $(call descend,$@) liblk: FORCE @@ -52,7 +53,7 @@ turbostat x86_energy_perf_policy: FORCE cpupower_install: $(call descend,power/$(@:_install=),install) -cgroup_install firewire_install lguest_install perf_install usb_install virtio_install vm_install: +cgroup_install firewire_install lguest_install perf_install usb_install virtio_install vm_install net_install: $(call descend,$(@:_install=),install) selftests_install: @@ -63,12 +64,12 @@ turbostat_install x86_energy_perf_policy_install: install: cgroup_install cpupower_install firewire_install lguest_install \ perf_install selftests_install turbostat_install usb_install \ - virtio_install vm_install x86_energy_perf_policy_install + virtio_install vm_install net_install x86_energy_perf_policy_install cpupower_clean: $(call descend,power/cpupower,clean) -cgroup_clean firewire_clean lguest_clean usb_clean virtio_clean vm_clean: +cgroup_clean firewire_clean lguest_clean usb_clean virtio_clean vm_clean net_clean: $(call descend,$(@:_clean=),clean) liblk_clean: @@ -85,6 +86,6 @@ turbostat_clean x86_energy_perf_policy_clean: clean: cgroup_clean cpupower_clean firewire_clean lguest_clean perf_clean \ selftests_clean turbostat_clean usb_clean virtio_clean \ - vm_clean x86_energy_perf_policy_clean + vm_clean net_clean x86_energy_perf_policy_clean .PHONY: FORCE diff --git a/tools/lguest/lguest.txt b/tools/lguest/lguest.txt index 7203ace65e83..06e1f4649511 
100644 --- a/tools/lguest/lguest.txt +++ b/tools/lguest/lguest.txt @@ -70,7 +70,7 @@ Running Lguest: - Run an lguest as root: - Documentation/virtual/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \ + tools/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \ --block=rootfile root=/dev/vda Explanation: diff --git a/tools/net/Makefile b/tools/net/Makefile new file mode 100644 index 000000000000..b4444d53b73f --- /dev/null +++ b/tools/net/Makefile @@ -0,0 +1,15 @@ +prefix = /usr + +CC = gcc + +all : bpf_jit_disasm + +bpf_jit_disasm : CFLAGS = -Wall -O2 +bpf_jit_disasm : LDLIBS = -lopcodes -lbfd -ldl +bpf_jit_disasm : bpf_jit_disasm.o + +clean : + rm -rf *.o bpf_jit_disasm + +install : + install bpf_jit_disasm $(prefix)/bin/bpf_jit_disasm diff --git a/tools/net/bpf_jit_disasm.c b/tools/net/bpf_jit_disasm.c new file mode 100644 index 000000000000..cfe0cdcda3de --- /dev/null +++ b/tools/net/bpf_jit_disasm.c @@ -0,0 +1,199 @@ +/* + * Minimal BPF JIT image disassembler + * + * Disassembles BPF JIT compiler emitted opcodes back to asm insn's for + * debugging or verification purposes. + * + * To get the disassembly of the JIT code, do the following: + * + * 1) `echo 2 > /proc/sys/net/core/bpf_jit_enable` + * 2) Load a BPF filter (e.g. `tcpdump -p -n -s 0 -i eth1 host 192.168.20.0/24`) + * 3) Run e.g. 
`bpf_jit_disasm -o` to read out the last JIT code + * + * Copyright 2013 Daniel Borkmann <borkmann@redhat.com> + * Licensed under the GNU General Public License, version 2.0 (GPLv2) + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <unistd.h> +#include <string.h> +#include <bfd.h> +#include <dis-asm.h> +#include <sys/klog.h> +#include <sys/types.h> +#include <regex.h> + +static void get_exec_path(char *tpath, size_t size) +{ + char *path; + ssize_t len; + + snprintf(tpath, size, "/proc/%d/exe", (int) getpid()); + tpath[size - 1] = 0; + + path = strdup(tpath); + assert(path); + + len = readlink(path, tpath, size); + tpath[len] = 0; + + free(path); +} + +static void get_asm_insns(uint8_t *image, size_t len, unsigned long base, + int opcodes) +{ + int count, i, pc = 0; + char tpath[256]; + struct disassemble_info info; + disassembler_ftype disassemble; + bfd *bfdf; + + memset(tpath, 0, sizeof(tpath)); + get_exec_path(tpath, sizeof(tpath)); + + bfdf = bfd_openr(tpath, NULL); + assert(bfdf); + assert(bfd_check_format(bfdf, bfd_object)); + + init_disassemble_info(&info, stdout, (fprintf_ftype) fprintf); + info.arch = bfd_get_arch(bfdf); + info.mach = bfd_get_mach(bfdf); + info.buffer = image; + info.buffer_length = len; + + disassemble_init_for_target(&info); + + disassemble = disassembler(bfdf); + assert(disassemble); + + do { + printf("%4x:\t", pc); + + count = disassemble(pc, &info); + + if (opcodes) { + printf("\n\t"); + for (i = 0; i < count; ++i) + printf("%02x ", (uint8_t) image[pc + i]); + } + printf("\n"); + + pc += count; + } while(count > 0 && pc < len); + + bfd_close(bfdf); +} + +static char *get_klog_buff(int *klen) +{ + int ret, len = klogctl(10, NULL, 0); + char *buff = malloc(len); + + assert(buff && klen); + ret = klogctl(3, buff, len); + assert(ret >= 0); + *klen = ret; + + return buff; +} + +static void put_klog_buff(char *buff) +{ + free(buff); +} + +static int get_last_jit_image(char *haystack, size_t 
hlen, + uint8_t *image, size_t ilen, + unsigned long *base) +{ + char *ptr, *pptr, *tmp; + off_t off = 0; + int ret, flen, proglen, pass, ulen = 0; + regmatch_t pmatch[1]; + regex_t regex; + + if (hlen == 0) + return 0; + + ret = regcomp(&regex, "flen=[[:alnum:]]+ proglen=[[:digit:]]+ " + "pass=[[:digit:]]+ image=[[:xdigit:]]+", REG_EXTENDED); + assert(ret == 0); + + ptr = haystack; + while (1) { + ret = regexec(&regex, ptr, 1, pmatch, 0); + if (ret == 0) { + ptr += pmatch[0].rm_eo; + off += pmatch[0].rm_eo; + assert(off < hlen); + } else + break; + } + + ptr = haystack + off - (pmatch[0].rm_eo - pmatch[0].rm_so); + ret = sscanf(ptr, "flen=%d proglen=%d pass=%d image=%lx", + &flen, &proglen, &pass, base); + if (ret != 4) + return 0; + + tmp = ptr = haystack + off; + while ((ptr = strtok(tmp, "\n")) != NULL && ulen < ilen) { + tmp = NULL; + if (!strstr(ptr, "JIT code")) + continue; + pptr = ptr; + while ((ptr = strstr(pptr, ":"))) + pptr = ptr + 1; + ptr = pptr; + do { + image[ulen++] = (uint8_t) strtoul(pptr, &pptr, 16); + if (ptr == pptr || ulen >= ilen) { + ulen--; + break; + } + ptr = pptr; + } while (1); + } + + assert(ulen == proglen); + printf("%d bytes emitted from JIT compiler (pass:%d, flen:%d)\n", + proglen, pass, flen); + printf("%lx + <x>:\n", *base); + + regfree(&regex); + return ulen; +} + +int main(int argc, char **argv) +{ + int len, klen, opcodes = 0; + char *kbuff; + unsigned long base; + uint8_t image[4096]; + + if (argc > 1) { + if (!strncmp("-o", argv[argc - 1], 2)) { + opcodes = 1; + } else { + printf("usage: bpf_jit_disasm [-o: show opcodes]\n"); + exit(0); + } + } + + bfd_init(); + memset(image, 0, sizeof(image)); + + kbuff = get_klog_buff(&klen); + + len = get_last_jit_image(kbuff, klen, image, sizeof(image), &base); + if (len > 0 && base > 0) + get_asm_insns(image, len, base, opcodes); + + put_klog_buff(kbuff); + + return 0; +} diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 3cc0ad7ae863..d4abc59ce1d9 100644 
--- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -1,10 +1,13 @@ TARGETS = breakpoints +TARGETS += cpu-hotplug +TARGETS += efivarfs TARGETS += kcmp +TARGETS += memory-hotplug TARGETS += mqueue +TARGETS += net +TARGETS += ptrace +TARGETS += soft-dirty TARGETS += vm -TARGETS += cpu-hotplug -TARGETS += memory-hotplug -TARGETS += efivarfs all: for TARGET in $(TARGETS); do \ diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore new file mode 100644 index 000000000000..00326629d4af --- /dev/null +++ b/tools/testing/selftests/net/.gitignore @@ -0,0 +1,3 @@ +socket +psock_fanout +psock_tpacket diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile new file mode 100644 index 000000000000..750512ba2c88 --- /dev/null +++ b/tools/testing/selftests/net/Makefile @@ -0,0 +1,19 @@ +# Makefile for net selftests + +CC = $(CROSS_COMPILE)gcc +CFLAGS = -Wall -O2 -g + +CFLAGS += -I../../../../usr/include/ + +NET_PROGS = socket psock_fanout psock_tpacket + +all: $(NET_PROGS) +%: %.c + $(CC) $(CFLAGS) -o $@ $^ + +run_tests: all + @/bin/sh ./run_netsocktests || echo "sockettests: [FAIL]" + @/bin/sh ./run_afpackettests || echo "afpackettests: [FAIL]" + +clean: + $(RM) $(NET_PROGS) diff --git a/tools/testing/selftests/net/psock_fanout.c b/tools/testing/selftests/net/psock_fanout.c new file mode 100644 index 000000000000..57b9c2b7c4ff --- /dev/null +++ b/tools/testing/selftests/net/psock_fanout.c @@ -0,0 +1,312 @@ +/* + * Copyright 2013 Google Inc. + * Author: Willem de Bruijn (willemb@google.com) + * + * A basic test of packet socket fanout behavior. 
+ * + * Control: + * - create fanout fails as expected with illegal flag combinations + * - join fanout fails as expected with diverging types or flags + * + * Datapath: + * Open a pair of packet sockets and a pair of INET sockets, send a known + * number of packets across the two INET sockets and count the number of + * packets enqueued onto the two packet sockets. + * + * The test currently runs for + * - PACKET_FANOUT_HASH + * - PACKET_FANOUT_HASH with PACKET_FANOUT_FLAG_ROLLOVER + * - PACKET_FANOUT_LB + * - PACKET_FANOUT_CPU + * - PACKET_FANOUT_ROLLOVER + * + * Todo: + * - functionality: PACKET_FANOUT_FLAG_DEFRAG + * + * License (GPLv2): + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#define _GNU_SOURCE /* for sched_setaffinity */ + +#include <arpa/inet.h> +#include <errno.h> +#include <fcntl.h> +#include <linux/filter.h> +#include <linux/if_packet.h> +#include <net/ethernet.h> +#include <netinet/ip.h> +#include <netinet/udp.h> +#include <poll.h> +#include <sched.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +#include "psock_lib.h" + +#define RING_NUM_FRAMES 20 + +/* Open a socket in a given fanout mode. 
+ * @return -1 if mode is bad, a valid socket otherwise */ +static int sock_fanout_open(uint16_t typeflags, int num_packets) +{ + int fd, val; + + fd = socket(PF_PACKET, SOCK_DGRAM, htons(ETH_P_IP)); + if (fd < 0) { + perror("socket packet"); + exit(1); + } + + /* fanout group ID is always 0: tests whether old groups are deleted */ + val = ((int) typeflags) << 16; + if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val))) { + if (close(fd)) { + perror("close packet"); + exit(1); + } + return -1; + } + + pair_udp_setfilter(fd); + return fd; +} + +static char *sock_fanout_open_ring(int fd) +{ + struct tpacket_req req = { + .tp_block_size = getpagesize(), + .tp_frame_size = getpagesize(), + .tp_block_nr = RING_NUM_FRAMES, + .tp_frame_nr = RING_NUM_FRAMES, + }; + char *ring; + int val = TPACKET_V2; + + if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, (void *) &val, + sizeof(val))) { + perror("packetsock ring setsockopt version"); + exit(1); + } + if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, (void *) &req, + sizeof(req))) { + perror("packetsock ring setsockopt"); + exit(1); + } + + ring = mmap(0, req.tp_block_size * req.tp_block_nr, + PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (!ring) { + fprintf(stderr, "packetsock ring mmap\n"); + exit(1); + } + + return ring; +} + +static int sock_fanout_read_ring(int fd, void *ring) +{ + struct tpacket2_hdr *header = ring; + int count = 0; + + while (header->tp_status & TP_STATUS_USER && count < RING_NUM_FRAMES) { + count++; + header = ring + (count * getpagesize()); + } + + return count; +} + +static int sock_fanout_read(int fds[], char *rings[], const int expect[]) +{ + int ret[2]; + + ret[0] = sock_fanout_read_ring(fds[0], rings[0]); + ret[1] = sock_fanout_read_ring(fds[1], rings[1]); + + fprintf(stderr, "info: count=%d,%d, expect=%d,%d\n", + ret[0], ret[1], expect[0], expect[1]); + + if ((!(ret[0] == expect[0] && ret[1] == expect[1])) && + (!(ret[0] == expect[1] && ret[1] == expect[0]))) { + fprintf(stderr, 
"ERROR: incorrect queue lengths\n"); + return 1; + } + + return 0; +} + +/* Test illegal mode + flag combination */ +static void test_control_single(void) +{ + fprintf(stderr, "test: control single socket\n"); + + if (sock_fanout_open(PACKET_FANOUT_ROLLOVER | + PACKET_FANOUT_FLAG_ROLLOVER, 0) != -1) { + fprintf(stderr, "ERROR: opened socket with dual rollover\n"); + exit(1); + } +} + +/* Test illegal group with different modes or flags */ +static void test_control_group(void) +{ + int fds[2]; + + fprintf(stderr, "test: control multiple sockets\n"); + + fds[0] = sock_fanout_open(PACKET_FANOUT_HASH, 20); + if (fds[0] == -1) { + fprintf(stderr, "ERROR: failed to open HASH socket\n"); + exit(1); + } + if (sock_fanout_open(PACKET_FANOUT_HASH | + PACKET_FANOUT_FLAG_DEFRAG, 10) != -1) { + fprintf(stderr, "ERROR: joined group with wrong flag defrag\n"); + exit(1); + } + if (sock_fanout_open(PACKET_FANOUT_HASH | + PACKET_FANOUT_FLAG_ROLLOVER, 10) != -1) { + fprintf(stderr, "ERROR: joined group with wrong flag ro\n"); + exit(1); + } + if (sock_fanout_open(PACKET_FANOUT_CPU, 10) != -1) { + fprintf(stderr, "ERROR: joined group with wrong mode\n"); + exit(1); + } + fds[1] = sock_fanout_open(PACKET_FANOUT_HASH, 20); + if (fds[1] == -1) { + fprintf(stderr, "ERROR: failed to join group\n"); + exit(1); + } + if (close(fds[1]) || close(fds[0])) { + fprintf(stderr, "ERROR: closing sockets\n"); + exit(1); + } +} + +static int test_datapath(uint16_t typeflags, int port_off, + const int expect1[], const int expect2[]) +{ + const int expect0[] = { 0, 0 }; + char *rings[2]; + int fds[2], fds_udp[2][2], ret; + + fprintf(stderr, "test: datapath 0x%hx\n", typeflags); + + fds[0] = sock_fanout_open(typeflags, 20); + fds[1] = sock_fanout_open(typeflags, 20); + if (fds[0] == -1 || fds[1] == -1) { + fprintf(stderr, "ERROR: failed open\n"); + exit(1); + } + rings[0] = sock_fanout_open_ring(fds[0]); + rings[1] = sock_fanout_open_ring(fds[1]); + pair_udp_open(fds_udp[0], PORT_BASE); + 
pair_udp_open(fds_udp[1], PORT_BASE + port_off); + sock_fanout_read(fds, rings, expect0); + + /* Send data, but not enough to overflow a queue */ + pair_udp_send(fds_udp[0], 15); + pair_udp_send(fds_udp[1], 5); + ret = sock_fanout_read(fds, rings, expect1); + + /* Send more data, overflow the queue */ + pair_udp_send(fds_udp[0], 15); + /* TODO: ensure consistent order between expect1 and expect2 */ + ret |= sock_fanout_read(fds, rings, expect2); + + if (munmap(rings[1], RING_NUM_FRAMES * getpagesize()) || + munmap(rings[0], RING_NUM_FRAMES * getpagesize())) { + fprintf(stderr, "close rings\n"); + exit(1); + } + if (close(fds_udp[1][1]) || close(fds_udp[1][0]) || + close(fds_udp[0][1]) || close(fds_udp[0][0]) || + close(fds[1]) || close(fds[0])) { + fprintf(stderr, "close datapath\n"); + exit(1); + } + + return ret; +} + +static int set_cpuaffinity(int cpuid) +{ + cpu_set_t mask; + + CPU_ZERO(&mask); + CPU_SET(cpuid, &mask); + if (sched_setaffinity(0, sizeof(mask), &mask)) { + if (errno != EINVAL) { + fprintf(stderr, "setaffinity %d\n", cpuid); + exit(1); + } + return 1; + } + + return 0; +} + +int main(int argc, char **argv) +{ + const int expect_hash[2][2] = { { 15, 5 }, { 20, 5 } }; + const int expect_hash_rb[2][2] = { { 15, 5 }, { 20, 15 } }; + const int expect_lb[2][2] = { { 10, 10 }, { 18, 17 } }; + const int expect_rb[2][2] = { { 20, 0 }, { 20, 15 } }; + const int expect_cpu0[2][2] = { { 20, 0 }, { 20, 0 } }; + const int expect_cpu1[2][2] = { { 0, 20 }, { 0, 20 } }; + int port_off = 2, tries = 5, ret; + + test_control_single(); + test_control_group(); + + /* find a set of ports that do not collide onto the same socket */ + ret = test_datapath(PACKET_FANOUT_HASH, port_off, + expect_hash[0], expect_hash[1]); + while (ret && tries--) { + fprintf(stderr, "info: trying alternate ports (%d)\n", tries); + ret = test_datapath(PACKET_FANOUT_HASH, ++port_off, + expect_hash[0], expect_hash[1]); + } + + ret |= test_datapath(PACKET_FANOUT_HASH | 
PACKET_FANOUT_FLAG_ROLLOVER, + port_off, expect_hash_rb[0], expect_hash_rb[1]); + ret |= test_datapath(PACKET_FANOUT_LB, + port_off, expect_lb[0], expect_lb[1]); + ret |= test_datapath(PACKET_FANOUT_ROLLOVER, + port_off, expect_rb[0], expect_rb[1]); + + set_cpuaffinity(0); + ret |= test_datapath(PACKET_FANOUT_CPU, port_off, + expect_cpu0[0], expect_cpu0[1]); + if (!set_cpuaffinity(1)) + /* TODO: test that choice alternates with previous */ + ret |= test_datapath(PACKET_FANOUT_CPU, port_off, + expect_cpu1[0], expect_cpu1[1]); + + if (ret) + return 1; + + printf("OK. All tests passed\n"); + return 0; +} diff --git a/tools/testing/selftests/net/psock_lib.h b/tools/testing/selftests/net/psock_lib.h new file mode 100644 index 000000000000..37da54ac85a9 --- /dev/null +++ b/tools/testing/selftests/net/psock_lib.h @@ -0,0 +1,127 @@ +/* + * Copyright 2013 Google Inc. + * Author: Willem de Bruijn <willemb@google.com> + * Daniel Borkmann <dborkman@redhat.com> + * + * License (GPLv2): + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#ifndef PSOCK_LIB_H +#define PSOCK_LIB_H + +#include <sys/types.h> +#include <sys/socket.h> +#include <string.h> +#include <arpa/inet.h> +#include <unistd.h> + +#define DATA_LEN 100 +#define DATA_CHAR 'a' + +#define PORT_BASE 8000 + +#ifndef __maybe_unused +# define __maybe_unused __attribute__ ((__unused__)) +#endif + +static __maybe_unused void pair_udp_setfilter(int fd) +{ + struct sock_filter bpf_filter[] = { + { 0x80, 0, 0, 0x00000000 }, /* LD pktlen */ + { 0x35, 0, 5, DATA_LEN }, /* JGE DATA_LEN [f goto nomatch]*/ + { 0x30, 0, 0, 0x00000050 }, /* LD ip[80] */ + { 0x15, 0, 3, DATA_CHAR }, /* JEQ DATA_CHAR [f goto nomatch]*/ + { 0x30, 0, 0, 0x00000051 }, /* LD ip[81] */ + { 0x15, 0, 1, DATA_CHAR }, /* JEQ DATA_CHAR [f goto nomatch]*/ + { 0x06, 0, 0, 0x00000060 }, /* RET match */ + { 0x06, 0, 0, 0x00000000 }, /* RET no match */ + }; + struct sock_fprog bpf_prog; + + bpf_prog.filter = bpf_filter; + bpf_prog.len = sizeof(bpf_filter) / sizeof(struct sock_filter); + if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &bpf_prog, + sizeof(bpf_prog))) { + perror("setsockopt SO_ATTACH_FILTER"); + exit(1); + } +} + +static __maybe_unused void pair_udp_open(int fds[], uint16_t port) +{ + struct sockaddr_in saddr, daddr; + + fds[0] = socket(PF_INET, SOCK_DGRAM, 0); + fds[1] = socket(PF_INET, SOCK_DGRAM, 0); + if (fds[0] == -1 || fds[1] == -1) { + fprintf(stderr, "ERROR: socket dgram\n"); + exit(1); + } + + memset(&saddr, 0, sizeof(saddr)); + saddr.sin_family = AF_INET; + saddr.sin_port = htons(port); + saddr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + + memset(&daddr, 0, sizeof(daddr)); + daddr.sin_family = AF_INET; + daddr.sin_port = htons(port + 1); + daddr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + + /* must bind both to get consistent hash result */ + if (bind(fds[1], (void *) &daddr, sizeof(daddr))) { + perror("bind"); + exit(1); + } + if (bind(fds[0], (void *) &saddr, sizeof(saddr))) { + perror("bind"); + exit(1); + } + if (connect(fds[0], (void *) &daddr, 
sizeof(daddr))) { + perror("connect"); + exit(1); + } +} + +static __maybe_unused void pair_udp_send(int fds[], int num) +{ + char buf[DATA_LEN], rbuf[DATA_LEN]; + + memset(buf, DATA_CHAR, sizeof(buf)); + while (num--) { + /* Should really handle EINTR and EAGAIN */ + if (write(fds[0], buf, sizeof(buf)) != sizeof(buf)) { + fprintf(stderr, "ERROR: send failed left=%d\n", num); + exit(1); + } + if (read(fds[1], rbuf, sizeof(rbuf)) != sizeof(rbuf)) { + fprintf(stderr, "ERROR: recv failed left=%d\n", num); + exit(1); + } + if (memcmp(buf, rbuf, sizeof(buf))) { + fprintf(stderr, "ERROR: data failed left=%d\n", num); + exit(1); + } + } +} + +static __maybe_unused void pair_udp_close(int fds[]) +{ + close(fds[0]); + close(fds[1]); +} + +#endif /* PSOCK_LIB_H */ diff --git a/tools/testing/selftests/net/psock_tpacket.c b/tools/testing/selftests/net/psock_tpacket.c new file mode 100644 index 000000000000..c41b58640a05 --- /dev/null +++ b/tools/testing/selftests/net/psock_tpacket.c @@ -0,0 +1,824 @@ +/* + * Copyright 2013 Red Hat, Inc. + * Author: Daniel Borkmann <dborkman@redhat.com> + * + * A basic test of packet socket's TPACKET_V1/TPACKET_V2/TPACKET_V3 behavior. + * + * Control: + * Test the setup of the TPACKET socket with different patterns that are + * known to fail (TODO) resp. succeed (OK). + * + * Datapath: + * Open a pair of packet sockets and send resp. receive an a priori known + * packet pattern across the sockets and check if it was received resp. + * sent correctly. Fanout in combination with RX_RING is currently not + * tested here. + * + * The test currently runs for + * - TPACKET_V1: RX_RING, TX_RING + * - TPACKET_V2: RX_RING, TX_RING + * - TPACKET_V3: RX_RING + * + * License (GPLv2): + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/mman.h> +#include <linux/if_packet.h> +#include <linux/filter.h> +#include <ctype.h> +#include <fcntl.h> +#include <unistd.h> +#include <bits/wordsize.h> +#include <net/ethernet.h> +#include <netinet/ip.h> +#include <arpa/inet.h> +#include <stdint.h> +#include <string.h> +#include <assert.h> +#include <net/if.h> +#include <inttypes.h> +#include <poll.h> + +#include "psock_lib.h" + +#ifndef bug_on +# define bug_on(cond) assert(!(cond)) +#endif + +#ifndef __aligned_tpacket +# define __aligned_tpacket __attribute__((aligned(TPACKET_ALIGNMENT))) +#endif + +#ifndef __align_tpacket +# define __align_tpacket(x) __attribute__((aligned(TPACKET_ALIGN(x)))) +#endif + +#define BLOCK_STATUS(x) ((x)->h1.block_status) +#define BLOCK_NUM_PKTS(x) ((x)->h1.num_pkts) +#define BLOCK_O2FP(x) ((x)->h1.offset_to_first_pkt) +#define BLOCK_LEN(x) ((x)->h1.blk_len) +#define BLOCK_SNUM(x) ((x)->h1.seq_num) +#define BLOCK_O2PRIV(x) ((x)->offset_to_priv) +#define BLOCK_PRIV(x) ((void *) ((uint8_t *) (x) + BLOCK_O2PRIV(x))) +#define BLOCK_HDR_LEN (ALIGN_8(sizeof(struct block_desc))) +#define ALIGN_8(x) (((x) + 8 - 1) & ~(8 - 1)) +#define BLOCK_PLUS_PRIV(sz_pri) (BLOCK_HDR_LEN + ALIGN_8((sz_pri))) + +#define NUM_PACKETS 100 + +struct ring { + struct iovec *rd; + uint8_t *mm_space; + size_t mm_len, rd_len; + struct sockaddr_ll ll; + void (*walk)(int sock, struct ring *ring); + int type, rd_num, flen, 
version; + union { + struct tpacket_req req; + struct tpacket_req3 req3; + }; +}; + +struct block_desc { + uint32_t version; + uint32_t offset_to_priv; + struct tpacket_hdr_v1 h1; +}; + +union frame_map { + struct { + struct tpacket_hdr tp_h __aligned_tpacket; + struct sockaddr_ll s_ll __align_tpacket(sizeof(struct tpacket_hdr)); + } *v1; + struct { + struct tpacket2_hdr tp_h __aligned_tpacket; + struct sockaddr_ll s_ll __align_tpacket(sizeof(struct tpacket2_hdr)); + } *v2; + void *raw; +}; + +static unsigned int total_packets, total_bytes; + +static int pfsocket(int ver) +{ + int ret, sock = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); + if (sock == -1) { + perror("socket"); + exit(1); + } + + ret = setsockopt(sock, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)); + if (ret == -1) { + perror("setsockopt"); + exit(1); + } + + return sock; +} + +static void status_bar_update(void) +{ + if (total_packets % 10 == 0) { + fprintf(stderr, "."); + fflush(stderr); + } +} + +static void test_payload(void *pay, size_t len) +{ + struct ethhdr *eth = pay; + + if (len < sizeof(struct ethhdr)) { + fprintf(stderr, "test_payload: packet too " + "small: %zu bytes!\n", len); + exit(1); + } + + if (eth->h_proto != htons(ETH_P_IP)) { + fprintf(stderr, "test_payload: wrong ethernet " + "type: 0x%x!\n", ntohs(eth->h_proto)); + exit(1); + } +} + +static void create_payload(void *pay, size_t *len) +{ + int i; + struct ethhdr *eth = pay; + struct iphdr *ip = pay + sizeof(*eth); + + /* Lets create some broken crap, that still passes + * our BPF filter. 
+ */ + + *len = DATA_LEN + 42; + + memset(pay, 0xff, ETH_ALEN * 2); + eth->h_proto = htons(ETH_P_IP); + + for (i = 0; i < sizeof(*ip); ++i) + ((uint8_t *) pay)[i + sizeof(*eth)] = (uint8_t) rand(); + + ip->ihl = 5; + ip->version = 4; + ip->protocol = 0x11; + ip->frag_off = 0; + ip->ttl = 64; + ip->tot_len = htons((uint16_t) *len - sizeof(*eth)); + + ip->saddr = htonl(INADDR_LOOPBACK); + ip->daddr = htonl(INADDR_LOOPBACK); + + memset(pay + sizeof(*eth) + sizeof(*ip), + DATA_CHAR, DATA_LEN); +} + +static inline int __v1_rx_kernel_ready(struct tpacket_hdr *hdr) +{ + return ((hdr->tp_status & TP_STATUS_USER) == TP_STATUS_USER); +} + +static inline void __v1_rx_user_ready(struct tpacket_hdr *hdr) +{ + hdr->tp_status = TP_STATUS_KERNEL; + __sync_synchronize(); +} + +static inline int __v2_rx_kernel_ready(struct tpacket2_hdr *hdr) +{ + return ((hdr->tp_status & TP_STATUS_USER) == TP_STATUS_USER); +} + +static inline void __v2_rx_user_ready(struct tpacket2_hdr *hdr) +{ + hdr->tp_status = TP_STATUS_KERNEL; + __sync_synchronize(); +} + +static inline int __v1_v2_rx_kernel_ready(void *base, int version) +{ + switch (version) { + case TPACKET_V1: + return __v1_rx_kernel_ready(base); + case TPACKET_V2: + return __v2_rx_kernel_ready(base); + default: + bug_on(1); + return 0; + } +} + +static inline void __v1_v2_rx_user_ready(void *base, int version) +{ + switch (version) { + case TPACKET_V1: + __v1_rx_user_ready(base); + break; + case TPACKET_V2: + __v2_rx_user_ready(base); + break; + } +} + +static void walk_v1_v2_rx(int sock, struct ring *ring) +{ + struct pollfd pfd; + int udp_sock[2]; + union frame_map ppd; + unsigned int frame_num = 0; + + bug_on(ring->type != PACKET_RX_RING); + + pair_udp_open(udp_sock, PORT_BASE); + pair_udp_setfilter(sock); + + memset(&pfd, 0, sizeof(pfd)); + pfd.fd = sock; + pfd.events = POLLIN | POLLERR; + pfd.revents = 0; + + pair_udp_send(udp_sock, NUM_PACKETS); + + while (total_packets < NUM_PACKETS * 2) { + while 
(__v1_v2_rx_kernel_ready(ring->rd[frame_num].iov_base, + ring->version)) { + ppd.raw = ring->rd[frame_num].iov_base; + + switch (ring->version) { + case TPACKET_V1: + test_payload((uint8_t *) ppd.raw + ppd.v1->tp_h.tp_mac, + ppd.v1->tp_h.tp_snaplen); + total_bytes += ppd.v1->tp_h.tp_snaplen; + break; + + case TPACKET_V2: + test_payload((uint8_t *) ppd.raw + ppd.v2->tp_h.tp_mac, + ppd.v2->tp_h.tp_snaplen); + total_bytes += ppd.v2->tp_h.tp_snaplen; + break; + } + + status_bar_update(); + total_packets++; + + __v1_v2_rx_user_ready(ppd.raw, ring->version); + + frame_num = (frame_num + 1) % ring->rd_num; + } + + poll(&pfd, 1, 1); + } + + pair_udp_close(udp_sock); + + if (total_packets != 2 * NUM_PACKETS) { + fprintf(stderr, "walk_v%d_rx: received %u out of %u pkts\n", + ring->version, total_packets, NUM_PACKETS); + exit(1); + } + + fprintf(stderr, " %u pkts (%u bytes)", NUM_PACKETS, total_bytes >> 1); +} + +static inline int __v1_tx_kernel_ready(struct tpacket_hdr *hdr) +{ + return !(hdr->tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING)); +} + +static inline void __v1_tx_user_ready(struct tpacket_hdr *hdr) +{ + hdr->tp_status = TP_STATUS_SEND_REQUEST; + __sync_synchronize(); +} + +static inline int __v2_tx_kernel_ready(struct tpacket2_hdr *hdr) +{ + return !(hdr->tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING)); +} + +static inline void __v2_tx_user_ready(struct tpacket2_hdr *hdr) +{ + hdr->tp_status = TP_STATUS_SEND_REQUEST; + __sync_synchronize(); +} + +static inline int __v1_v2_tx_kernel_ready(void *base, int version) +{ + switch (version) { + case TPACKET_V1: + return __v1_tx_kernel_ready(base); + case TPACKET_V2: + return __v2_tx_kernel_ready(base); + default: + bug_on(1); + return 0; + } +} + +static inline void __v1_v2_tx_user_ready(void *base, int version) +{ + switch (version) { + case TPACKET_V1: + __v1_tx_user_ready(base); + break; + case TPACKET_V2: + __v2_tx_user_ready(base); + break; + } +} + +static void 
__v1_v2_set_packet_loss_discard(int sock) +{ + int ret, discard = 1; + + ret = setsockopt(sock, SOL_PACKET, PACKET_LOSS, (void *) &discard, + sizeof(discard)); + if (ret == -1) { + perror("setsockopt"); + exit(1); + } +} + +static void walk_v1_v2_tx(int sock, struct ring *ring) +{ + struct pollfd pfd; + int rcv_sock, ret; + size_t packet_len; + union frame_map ppd; + char packet[1024]; + unsigned int frame_num = 0, got = 0; + struct sockaddr_ll ll = { + .sll_family = PF_PACKET, + .sll_halen = ETH_ALEN, + }; + + bug_on(ring->type != PACKET_TX_RING); + bug_on(ring->rd_num < NUM_PACKETS); + + rcv_sock = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); + if (rcv_sock == -1) { + perror("socket"); + exit(1); + } + + pair_udp_setfilter(rcv_sock); + + ll.sll_ifindex = if_nametoindex("lo"); + ret = bind(rcv_sock, (struct sockaddr *) &ll, sizeof(ll)); + if (ret == -1) { + perror("bind"); + exit(1); + } + + memset(&pfd, 0, sizeof(pfd)); + pfd.fd = sock; + pfd.events = POLLOUT | POLLERR; + pfd.revents = 0; + + total_packets = NUM_PACKETS; + create_payload(packet, &packet_len); + + while (total_packets > 0) { + while (__v1_v2_tx_kernel_ready(ring->rd[frame_num].iov_base, + ring->version) && + total_packets > 0) { + ppd.raw = ring->rd[frame_num].iov_base; + + switch (ring->version) { + case TPACKET_V1: + ppd.v1->tp_h.tp_snaplen = packet_len; + ppd.v1->tp_h.tp_len = packet_len; + + memcpy((uint8_t *) ppd.raw + TPACKET_HDRLEN - + sizeof(struct sockaddr_ll), packet, + packet_len); + total_bytes += ppd.v1->tp_h.tp_snaplen; + break; + + case TPACKET_V2: + ppd.v2->tp_h.tp_snaplen = packet_len; + ppd.v2->tp_h.tp_len = packet_len; + + memcpy((uint8_t *) ppd.raw + TPACKET2_HDRLEN - + sizeof(struct sockaddr_ll), packet, + packet_len); + total_bytes += ppd.v2->tp_h.tp_snaplen; + break; + } + + status_bar_update(); + total_packets--; + + __v1_v2_tx_user_ready(ppd.raw, ring->version); + + frame_num = (frame_num + 1) % ring->rd_num; + } + + poll(&pfd, 1, 1); + } + + bug_on(total_packets != 0); 
+ + ret = sendto(sock, NULL, 0, 0, NULL, 0); + if (ret == -1) { + perror("sendto"); + exit(1); + } + + while ((ret = recvfrom(rcv_sock, packet, sizeof(packet), + 0, NULL, NULL)) > 0 && + total_packets < NUM_PACKETS) { + got += ret; + test_payload(packet, ret); + + status_bar_update(); + total_packets++; + } + + close(rcv_sock); + + if (total_packets != NUM_PACKETS) { + fprintf(stderr, "walk_v%d_rx: received %u out of %u pkts\n", + ring->version, total_packets, NUM_PACKETS); + exit(1); + } + + fprintf(stderr, " %u pkts (%u bytes)", NUM_PACKETS, got); +} + +static void walk_v1_v2(int sock, struct ring *ring) +{ + if (ring->type == PACKET_RX_RING) + walk_v1_v2_rx(sock, ring); + else + walk_v1_v2_tx(sock, ring); +} + +static uint64_t __v3_prev_block_seq_num = 0; + +void __v3_test_block_seq_num(struct block_desc *pbd) +{ + if (__v3_prev_block_seq_num + 1 != BLOCK_SNUM(pbd)) { + fprintf(stderr, "\nprev_block_seq_num:%"PRIu64", expected " + "seq:%"PRIu64" != actual seq:%"PRIu64"\n", + __v3_prev_block_seq_num, __v3_prev_block_seq_num + 1, + (uint64_t) BLOCK_SNUM(pbd)); + exit(1); + } + + __v3_prev_block_seq_num = BLOCK_SNUM(pbd); +} + +static void __v3_test_block_len(struct block_desc *pbd, uint32_t bytes, int block_num) +{ + if (BLOCK_NUM_PKTS(pbd)) { + if (bytes != BLOCK_LEN(pbd)) { + fprintf(stderr, "\nblock:%u with %upackets, expected " + "len:%u != actual len:%u\n", block_num, + BLOCK_NUM_PKTS(pbd), bytes, BLOCK_LEN(pbd)); + exit(1); + } + } else { + if (BLOCK_LEN(pbd) != BLOCK_PLUS_PRIV(13)) { + fprintf(stderr, "\nblock:%u, expected len:%lu != " + "actual len:%u\n", block_num, BLOCK_HDR_LEN, + BLOCK_LEN(pbd)); + exit(1); + } + } +} + +static void __v3_test_block_header(struct block_desc *pbd, const int block_num) +{ + uint32_t block_status = BLOCK_STATUS(pbd); + + if ((block_status & TP_STATUS_USER) == 0) { + fprintf(stderr, "\nblock %u: not in TP_STATUS_USER\n", block_num); + exit(1); + } + + __v3_test_block_seq_num(pbd); +} + +static void __v3_walk_block(struct 
block_desc *pbd, const int block_num) +{ + int num_pkts = BLOCK_NUM_PKTS(pbd), i; + unsigned long bytes = 0; + unsigned long bytes_with_padding = BLOCK_PLUS_PRIV(13); + struct tpacket3_hdr *ppd; + + __v3_test_block_header(pbd, block_num); + + ppd = (struct tpacket3_hdr *) ((uint8_t *) pbd + BLOCK_O2FP(pbd)); + for (i = 0; i < num_pkts; ++i) { + bytes += ppd->tp_snaplen; + + if (ppd->tp_next_offset) + bytes_with_padding += ppd->tp_next_offset; + else + bytes_with_padding += ALIGN_8(ppd->tp_snaplen + ppd->tp_mac); + + test_payload((uint8_t *) ppd + ppd->tp_mac, ppd->tp_snaplen); + + status_bar_update(); + total_packets++; + + ppd = (struct tpacket3_hdr *) ((uint8_t *) ppd + ppd->tp_next_offset); + __sync_synchronize(); + } + + __v3_test_block_len(pbd, bytes_with_padding, block_num); + total_bytes += bytes; +} + +void __v3_flush_block(struct block_desc *pbd) +{ + BLOCK_STATUS(pbd) = TP_STATUS_KERNEL; + __sync_synchronize(); +} + +static void walk_v3_rx(int sock, struct ring *ring) +{ + unsigned int block_num = 0; + struct pollfd pfd; + struct block_desc *pbd; + int udp_sock[2]; + + bug_on(ring->type != PACKET_RX_RING); + + pair_udp_open(udp_sock, PORT_BASE); + pair_udp_setfilter(sock); + + memset(&pfd, 0, sizeof(pfd)); + pfd.fd = sock; + pfd.events = POLLIN | POLLERR; + pfd.revents = 0; + + pair_udp_send(udp_sock, NUM_PACKETS); + + while (total_packets < NUM_PACKETS * 2) { + pbd = (struct block_desc *) ring->rd[block_num].iov_base; + + while ((BLOCK_STATUS(pbd) & TP_STATUS_USER) == 0) + poll(&pfd, 1, 1); + + __v3_walk_block(pbd, block_num); + __v3_flush_block(pbd); + + block_num = (block_num + 1) % ring->rd_num; + } + + pair_udp_close(udp_sock); + + if (total_packets != 2 * NUM_PACKETS) { + fprintf(stderr, "walk_v3_rx: received %u out of %u pkts\n", + total_packets, NUM_PACKETS); + exit(1); + } + + fprintf(stderr, " %u pkts (%u bytes)", NUM_PACKETS, total_bytes >> 1); +} + +static void walk_v3(int sock, struct ring *ring) +{ + if (ring->type == PACKET_RX_RING) + 
walk_v3_rx(sock, ring); + else + bug_on(1); +} + +static void __v1_v2_fill(struct ring *ring, unsigned int blocks) +{ + ring->req.tp_block_size = getpagesize() << 2; + ring->req.tp_frame_size = TPACKET_ALIGNMENT << 7; + ring->req.tp_block_nr = blocks; + + ring->req.tp_frame_nr = ring->req.tp_block_size / + ring->req.tp_frame_size * + ring->req.tp_block_nr; + + ring->mm_len = ring->req.tp_block_size * ring->req.tp_block_nr; + ring->walk = walk_v1_v2; + ring->rd_num = ring->req.tp_frame_nr; + ring->flen = ring->req.tp_frame_size; +} + +static void __v3_fill(struct ring *ring, unsigned int blocks) +{ + ring->req3.tp_retire_blk_tov = 64; + ring->req3.tp_sizeof_priv = 13; + ring->req3.tp_feature_req_word |= TP_FT_REQ_FILL_RXHASH; + + ring->req3.tp_block_size = getpagesize() << 2; + ring->req3.tp_frame_size = TPACKET_ALIGNMENT << 7; + ring->req3.tp_block_nr = blocks; + + ring->req3.tp_frame_nr = ring->req3.tp_block_size / + ring->req3.tp_frame_size * + ring->req3.tp_block_nr; + + ring->mm_len = ring->req3.tp_block_size * ring->req3.tp_block_nr; + ring->walk = walk_v3; + ring->rd_num = ring->req3.tp_block_nr; + ring->flen = ring->req3.tp_block_size; +} + +static void setup_ring(int sock, struct ring *ring, int version, int type) +{ + int ret = 0; + unsigned int blocks = 256; + + ring->type = type; + ring->version = version; + + switch (version) { + case TPACKET_V1: + case TPACKET_V2: + if (type == PACKET_TX_RING) + __v1_v2_set_packet_loss_discard(sock); + __v1_v2_fill(ring, blocks); + ret = setsockopt(sock, SOL_PACKET, type, &ring->req, + sizeof(ring->req)); + break; + + case TPACKET_V3: + __v3_fill(ring, blocks); + ret = setsockopt(sock, SOL_PACKET, type, &ring->req3, + sizeof(ring->req3)); + break; + } + + if (ret == -1) { + perror("setsockopt"); + exit(1); + } + + ring->rd_len = ring->rd_num * sizeof(*ring->rd); + ring->rd = malloc(ring->rd_len); + if (ring->rd == NULL) { + perror("malloc"); + exit(1); + } + + total_packets = 0; + total_bytes = 0; +} + +static void 
mmap_ring(int sock, struct ring *ring) +{ + int i; + + ring->mm_space = mmap(0, ring->mm_len, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_LOCKED | MAP_POPULATE, sock, 0); + if (ring->mm_space == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + memset(ring->rd, 0, ring->rd_len); + for (i = 0; i < ring->rd_num; ++i) { + ring->rd[i].iov_base = ring->mm_space + (i * ring->flen); + ring->rd[i].iov_len = ring->flen; + } +} + +static void bind_ring(int sock, struct ring *ring) +{ + int ret; + + ring->ll.sll_family = PF_PACKET; + ring->ll.sll_protocol = htons(ETH_P_ALL); + ring->ll.sll_ifindex = if_nametoindex("lo"); + ring->ll.sll_hatype = 0; + ring->ll.sll_pkttype = 0; + ring->ll.sll_halen = 0; + + ret = bind(sock, (struct sockaddr *) &ring->ll, sizeof(ring->ll)); + if (ret == -1) { + perror("bind"); + exit(1); + } +} + +static void walk_ring(int sock, struct ring *ring) +{ + ring->walk(sock, ring); +} + +static void unmap_ring(int sock, struct ring *ring) +{ + munmap(ring->mm_space, ring->mm_len); + free(ring->rd); +} + +static int test_kernel_bit_width(void) +{ + char in[512], *ptr; + int num = 0, fd; + ssize_t ret; + + fd = open("/proc/kallsyms", O_RDONLY); + if (fd == -1) { + perror("open"); + exit(1); + } + + ret = read(fd, in, sizeof(in)); + if (ret <= 0) { + perror("read"); + exit(1); + } + + close(fd); + + ptr = in; + while(!isspace(*ptr)) { + num++; + ptr++; + } + + return num * 4; +} + +static int test_user_bit_width(void) +{ + return __WORDSIZE; +} + +static const char *tpacket_str[] = { + [TPACKET_V1] = "TPACKET_V1", + [TPACKET_V2] = "TPACKET_V2", + [TPACKET_V3] = "TPACKET_V3", +}; + +static const char *type_str[] = { + [PACKET_RX_RING] = "PACKET_RX_RING", + [PACKET_TX_RING] = "PACKET_TX_RING", +}; + +static int test_tpacket(int version, int type) +{ + int sock; + struct ring ring; + + fprintf(stderr, "test: %s with %s ", tpacket_str[version], + type_str[type]); + fflush(stderr); + + if (version == TPACKET_V1 && + test_kernel_bit_width() != 
test_user_bit_width()) { + fprintf(stderr, "test: skip %s %s since user and kernel " + "space have different bit width\n", + tpacket_str[version], type_str[type]); + return 0; + } + + sock = pfsocket(version); + memset(&ring, 0, sizeof(ring)); + setup_ring(sock, &ring, version, type); + mmap_ring(sock, &ring); + bind_ring(sock, &ring); + walk_ring(sock, &ring); + unmap_ring(sock, &ring); + close(sock); + + fprintf(stderr, "\n"); + return 0; +} + +int main(void) +{ + int ret = 0; + + ret |= test_tpacket(TPACKET_V1, PACKET_RX_RING); + ret |= test_tpacket(TPACKET_V1, PACKET_TX_RING); + + ret |= test_tpacket(TPACKET_V2, PACKET_RX_RING); + ret |= test_tpacket(TPACKET_V2, PACKET_TX_RING); + + ret |= test_tpacket(TPACKET_V3, PACKET_RX_RING); + + if (ret) + return 1; + + printf("OK. All tests passed\n"); + return 0; +} diff --git a/tools/testing/selftests/net/run_afpackettests b/tools/testing/selftests/net/run_afpackettests new file mode 100644 index 000000000000..5246e782d6e8 --- /dev/null +++ b/tools/testing/selftests/net/run_afpackettests @@ -0,0 +1,26 @@ +#!/bin/sh + +if [ $(id -u) != 0 ]; then + echo $msg must be run as root >&2 + exit 0 +fi + +echo "--------------------" +echo "running psock_fanout test" +echo "--------------------" +./psock_fanout +if [ $? -ne 0 ]; then + echo "[FAIL]" +else + echo "[PASS]" +fi + +echo "--------------------" +echo "running psock_tpacket test" +echo "--------------------" +./psock_tpacket +if [ $? -ne 0 ]; then + echo "[FAIL]" +else + echo "[PASS]" +fi diff --git a/tools/testing/selftests/net/run_netsocktests b/tools/testing/selftests/net/run_netsocktests new file mode 100644 index 000000000000..c09a682df56a --- /dev/null +++ b/tools/testing/selftests/net/run_netsocktests @@ -0,0 +1,12 @@ +#!/bin/bash + +echo "--------------------" +echo "running socket test" +echo "--------------------" +./socket +if [ $? 
-ne 0 ]; then + echo "[FAIL]" +else + echo "[PASS]" +fi + diff --git a/tools/testing/selftests/net/socket.c b/tools/testing/selftests/net/socket.c new file mode 100644 index 000000000000..0f227f2f9be9 --- /dev/null +++ b/tools/testing/selftests/net/socket.c @@ -0,0 +1,92 @@ +#include <stdio.h> +#include <errno.h> +#include <unistd.h> +#include <string.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> + +struct socket_testcase { + int domain; + int type; + int protocol; + + /* 0 = valid file descriptor + * -foo = error foo + */ + int expect; + + /* If non-zero, accept EAFNOSUPPORT to handle the case + * of the protocol not being configured into the kernel. + */ + int nosupport_ok; +}; + +static struct socket_testcase tests[] = { + { AF_MAX, 0, 0, -EAFNOSUPPORT, 0 }, + { AF_INET, SOCK_STREAM, IPPROTO_TCP, 0, 1 }, + { AF_INET, SOCK_DGRAM, IPPROTO_TCP, -EPROTONOSUPPORT, 1 }, + { AF_INET, SOCK_DGRAM, IPPROTO_UDP, 0, 1 }, + { AF_INET, SOCK_STREAM, IPPROTO_UDP, -EPROTONOSUPPORT, 1 }, +}; + +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) +#define ERR_STRING_SZ 64 + +static int run_tests(void) +{ + char err_string1[ERR_STRING_SZ]; + char err_string2[ERR_STRING_SZ]; + int i, err; + + err = 0; + for (i = 0; i < ARRAY_SIZE(tests); i++) { + struct socket_testcase *s = &tests[i]; + int fd; + + fd = socket(s->domain, s->type, s->protocol); + if (fd < 0) { + if (s->nosupport_ok && + errno == EAFNOSUPPORT) + continue; + + if (s->expect < 0 && + errno == -s->expect) + continue; + + strerror_r(-s->expect, err_string1, ERR_STRING_SZ); + strerror_r(errno, err_string2, ERR_STRING_SZ); + + fprintf(stderr, "socket(%d, %d, %d) expected " + "err (%s) got (%s)\n", + s->domain, s->type, s->protocol, + err_string1, err_string2); + + err = -1; + break; + } else { + close(fd); + + if (s->expect < 0) { + strerror_r(errno, err_string1, ERR_STRING_SZ); + + fprintf(stderr, "socket(%d, %d, %d) expected " + "success got err (%s)\n", + s->domain, s->type, 
s->protocol, + err_string1); + + err = -1; + break; + } + } + } + + return err; +} + +int main(void) +{ + int err = run_tests(); + + return err; +} diff --git a/tools/testing/selftests/ptrace/Makefile b/tools/testing/selftests/ptrace/Makefile new file mode 100644 index 000000000000..47ae2d385ce8 --- /dev/null +++ b/tools/testing/selftests/ptrace/Makefile @@ -0,0 +1,10 @@ +CFLAGS += -iquote../../../../include/uapi -Wall +peeksiginfo: peeksiginfo.c + +all: peeksiginfo + +clean: + rm -f peeksiginfo + +run_tests: all + @./peeksiginfo || echo "peeksiginfo selftests: [FAIL]" diff --git a/tools/testing/selftests/ptrace/peeksiginfo.c b/tools/testing/selftests/ptrace/peeksiginfo.c new file mode 100644 index 000000000000..d46558b1f58d --- /dev/null +++ b/tools/testing/selftests/ptrace/peeksiginfo.c @@ -0,0 +1,214 @@ +#define _GNU_SOURCE +#include <stdio.h> +#include <signal.h> +#include <unistd.h> +#include <errno.h> +#include <linux/types.h> +#include <sys/wait.h> +#include <sys/syscall.h> +#include <sys/user.h> +#include <sys/mman.h> + +#include "linux/ptrace.h" + +static int sys_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *uinfo) +{ + return syscall(SYS_rt_sigqueueinfo, tgid, sig, uinfo); +} + +static int sys_rt_tgsigqueueinfo(pid_t tgid, pid_t tid, + int sig, siginfo_t *uinfo) +{ + return syscall(SYS_rt_tgsigqueueinfo, tgid, tid, sig, uinfo); +} + +static int sys_ptrace(int request, pid_t pid, void *addr, void *data) +{ + return syscall(SYS_ptrace, request, pid, addr, data); +} + +#define SIGNR 10 +#define TEST_SICODE_PRIV -1 +#define TEST_SICODE_SHARE -2 + +#define err(fmt, ...) \ + fprintf(stderr, \ + "Error (%s:%d): " fmt, \ + __FILE__, __LINE__, ##__VA_ARGS__) + +static int check_error_paths(pid_t child) +{ + struct ptrace_peeksiginfo_args arg; + int ret, exit_code = -1; + void *addr_rw, *addr_ro; + + /* + * Allocate two contiguous pages. The first one is for read-write, + * another is for read-only. 
+ */ + addr_rw = mmap(NULL, 2 * PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (addr_rw == MAP_FAILED) { + err("mmap() failed: %m\n"); + return 1; + } + + addr_ro = mmap(addr_rw + PAGE_SIZE, PAGE_SIZE, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (addr_ro == MAP_FAILED) { + err("mmap() failed: %m\n"); + goto out; + } + + arg.nr = SIGNR; + arg.off = 0; + + /* Unsupported flags */ + arg.flags = ~0; + ret = sys_ptrace(PTRACE_PEEKSIGINFO, child, &arg, addr_rw); + if (ret != -1 || errno != EINVAL) { + err("sys_ptrace() returns %d (expected -1)," + " errno %d (expected %d): %m\n", + ret, errno, EINVAL); + goto out; + } + arg.flags = 0; + + /* A part of the buffer is read-only */ + ret = sys_ptrace(PTRACE_PEEKSIGINFO, child, &arg, + addr_ro - sizeof(siginfo_t) * 2); + if (ret != 2) { + err("sys_ptrace() returns %d (expected 2): %m\n", ret); + goto out; + } + + /* Read-only buffer */ + ret = sys_ptrace(PTRACE_PEEKSIGINFO, child, &arg, addr_ro); + if (ret != -1 && errno != EFAULT) { + err("sys_ptrace() returns %d (expected -1)," + " errno %d (expected %d): %m\n", + ret, errno, EFAULT); + goto out; + } + + exit_code = 0; +out: + munmap(addr_rw, 2 * PAGE_SIZE); + return exit_code; +} + +int check_direct_path(pid_t child, int shared, int nr) +{ + struct ptrace_peeksiginfo_args arg = {.flags = 0, .nr = nr, .off = 0}; + int i, j, ret, exit_code = -1; + siginfo_t siginfo[SIGNR]; + int si_code; + + if (shared == 1) { + arg.flags = PTRACE_PEEKSIGINFO_SHARED; + si_code = TEST_SICODE_SHARE; + } else { + arg.flags = 0; + si_code = TEST_SICODE_PRIV; + } + + for (i = 0; i < SIGNR; ) { + arg.off = i; + ret = sys_ptrace(PTRACE_PEEKSIGINFO, child, &arg, siginfo); + if (ret == -1) { + err("ptrace() failed: %m\n"); + goto out; + } + + if (ret == 0) + break; + + for (j = 0; j < ret; j++, i++) { + if (siginfo[j].si_code == si_code && + siginfo[j].si_int == i) + continue; + + err("%d: Wrong siginfo i=%d si_code=%d si_int=%d\n", + shared, i, 
siginfo[j].si_code, siginfo[j].si_int); + goto out; + } + } + + if (i != SIGNR) { + err("Only %d signals were read\n", i); + goto out; + } + + exit_code = 0; +out: + return exit_code; +} + +int main(int argc, char *argv[]) +{ + siginfo_t siginfo[SIGNR]; + int i, exit_code = 1; + sigset_t blockmask; + pid_t child; + + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGRTMIN); + sigprocmask(SIG_BLOCK, &blockmask, NULL); + + child = fork(); + if (child == -1) { + err("fork() failed: %m"); + return 1; + } else if (child == 0) { + pid_t ppid = getppid(); + while (1) { + if (ppid != getppid()) + break; + sleep(1); + } + return 1; + } + + /* Send signals in process-wide and per-thread queues */ + for (i = 0; i < SIGNR; i++) { + siginfo->si_code = TEST_SICODE_SHARE; + siginfo->si_int = i; + sys_rt_sigqueueinfo(child, SIGRTMIN, siginfo); + + siginfo->si_code = TEST_SICODE_PRIV; + siginfo->si_int = i; + sys_rt_tgsigqueueinfo(child, child, SIGRTMIN, siginfo); + } + + if (sys_ptrace(PTRACE_ATTACH, child, NULL, NULL) == -1) + return 1; + + waitpid(child, NULL, 0); + + /* Dump signals one by one*/ + if (check_direct_path(child, 0, 1)) + goto out; + /* Dump all signals for one call */ + if (check_direct_path(child, 0, SIGNR)) + goto out; + + /* + * Dump signal from the process-wide queue. 
+ * The number of signals is not multible to the buffer size + */ + if (check_direct_path(child, 1, 3)) + goto out; + + if (check_error_paths(child)) + goto out; + + printf("PASS\n"); + exit_code = 0; +out: + if (sys_ptrace(PTRACE_KILL, child, NULL, NULL) == -1) + return 1; + + waitpid(child, NULL, 0); + + return exit_code; +} diff --git a/tools/testing/selftests/soft-dirty/Makefile b/tools/testing/selftests/soft-dirty/Makefile new file mode 100644 index 000000000000..a9cdc823d6e0 --- /dev/null +++ b/tools/testing/selftests/soft-dirty/Makefile @@ -0,0 +1,10 @@ +CFLAGS += -iquote../../../../include/uapi -Wall +soft-dirty: soft-dirty.c + +all: soft-dirty + +clean: + rm -f soft-dirty + +run_tests: all + @./soft-dirty || echo "soft-dirty selftests: [FAIL]" diff --git a/tools/testing/selftests/soft-dirty/soft-dirty.c b/tools/testing/selftests/soft-dirty/soft-dirty.c new file mode 100644 index 000000000000..aba4f87f87f0 --- /dev/null +++ b/tools/testing/selftests/soft-dirty/soft-dirty.c @@ -0,0 +1,114 @@ +#include <stdlib.h> +#include <stdio.h> +#include <sys/mman.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> + +typedef unsigned long long u64; + +#define PME_PRESENT (1ULL << 63) +#define PME_SOFT_DIRTY (1Ull << 55) + +#define PAGES_TO_TEST 3 +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +static void get_pagemap2(char *mem, u64 *map) +{ + int fd; + + fd = open("/proc/self/pagemap2", O_RDONLY); + if (fd < 0) { + perror("Can't open pagemap2"); + exit(1); + } + + lseek(fd, (unsigned long)mem / PAGE_SIZE * sizeof(u64), SEEK_SET); + read(fd, map, sizeof(u64) * PAGES_TO_TEST); + close(fd); +} + +static inline char map_p(u64 map) +{ + return map & PME_PRESENT ? 'p' : '-'; +} + +static inline char map_sd(u64 map) +{ + return map & PME_SOFT_DIRTY ? 
'd' : '-'; +} + +static int check_pte(int step, int page, u64 *map, u64 want) +{ + if ((map[page] & want) != want) { + printf("Step %d Page %d has %c%c, want %c%c\n", + step, page, + map_p(map[page]), map_sd(map[page]), + map_p(want), map_sd(want)); + return 1; + } + + return 0; +} + +static void clear_refs(void) +{ + int fd; + char *v = "4"; + + fd = open("/proc/self/clear_refs", O_WRONLY); + if (write(fd, v, 3) < 3) { + perror("Can't clear soft-dirty bit"); + exit(1); + } + close(fd); +} + +int main(void) +{ + char *mem, x; + u64 map[PAGES_TO_TEST]; + + mem = mmap(NULL, PAGES_TO_TEST * PAGE_SIZE, + PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, 0, 0); + + x = mem[0]; + mem[2 * PAGE_SIZE] = 'c'; + get_pagemap2(mem, map); + + if (check_pte(1, 0, map, PME_PRESENT)) + return 1; + if (check_pte(1, 1, map, 0)) + return 1; + if (check_pte(1, 2, map, PME_PRESENT | PME_SOFT_DIRTY)) + return 1; + + clear_refs(); + get_pagemap2(mem, map); + + if (check_pte(2, 0, map, PME_PRESENT)) + return 1; + if (check_pte(2, 1, map, 0)) + return 1; + if (check_pte(2, 2, map, PME_PRESENT)) + return 1; + + mem[0] = 'a'; + mem[PAGE_SIZE] = 'b'; + x = mem[2 * PAGE_SIZE]; + get_pagemap2(mem, map); + + if (check_pte(3, 0, map, PME_PRESENT | PME_SOFT_DIRTY)) + return 1; + if (check_pte(3, 1, map, PME_PRESENT | PME_SOFT_DIRTY)) + return 1; + if (check_pte(3, 2, map, PME_PRESENT)) + return 1; + + (void)x; /* gcc warn */ + + printf("PASS\n"); + return 0; +} diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile index d1d442ed106a..3187c62d9814 100644 --- a/tools/virtio/Makefile +++ b/tools/virtio/Makefile @@ -1,12 +1,14 @@ all: test mod -test: virtio_test +test: virtio_test vringh_test virtio_test: virtio_ring.o virtio_test.o -CFLAGS += -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -MMD -vpath %.c ../../drivers/virtio +vringh_test: vringh_test.o vringh.o virtio_ring.o + +CFLAGS += -g -O2 -Wall -I. 
-I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE +vpath %.c ../../drivers/virtio ../../drivers/vhost mod: ${MAKE} -C `pwd`/../.. M=`pwd`/vhost_test .PHONY: all test mod clean clean: - ${RM} *.o vhost_test/*.o vhost_test/.*.cmd \ + ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \ vhost_test/Module.symvers vhost_test/modules.order *.d -include *.d diff --git a/tools/virtio/asm/barrier.h b/tools/virtio/asm/barrier.h new file mode 100644 index 000000000000..aff61e13306c --- /dev/null +++ b/tools/virtio/asm/barrier.h @@ -0,0 +1,14 @@ +#if defined(__i386__) || defined(__x86_64__) +#define barrier() asm volatile("" ::: "memory") +#define mb() __sync_synchronize() + +#define smp_mb() mb() +# define smp_rmb() barrier() +# define smp_wmb() barrier() +/* Weak barriers should be used. If not - it's a bug */ +# define rmb() abort() +# define wmb() abort() +#else +#error Please fill in barrier macros +#endif + diff --git a/tools/virtio/linux/bug.h b/tools/virtio/linux/bug.h new file mode 100644 index 000000000000..fb94f0787c47 --- /dev/null +++ b/tools/virtio/linux/bug.h @@ -0,0 +1,10 @@ +#ifndef BUG_H +#define BUG_H + +#define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond)) + +#define BUILD_BUG_ON(x) + +#define BUG() abort() + +#endif /* BUG_H */ diff --git a/tools/virtio/linux/err.h b/tools/virtio/linux/err.h new file mode 100644 index 000000000000..e32eff8b2a14 --- /dev/null +++ b/tools/virtio/linux/err.h @@ -0,0 +1,26 @@ +#ifndef ERR_H +#define ERR_H +#define MAX_ERRNO 4095 + +#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO) + +static inline void * __must_check ERR_PTR(long error) +{ + return (void *) error; +} + +static inline long __must_check PTR_ERR(const void *ptr) +{ + return (long) ptr; +} + +static inline long __must_check IS_ERR(const void *ptr) +{ + return IS_ERR_VALUE((unsigned long)ptr); +} + +static inline long __must_check IS_ERR_OR_NULL(const void *ptr) +{ 
+ return !ptr || IS_ERR_VALUE((unsigned long)ptr); +} +#endif /* ERR_H */ diff --git a/tools/virtio/linux/export.h b/tools/virtio/linux/export.h new file mode 100644 index 000000000000..7311d326894a --- /dev/null +++ b/tools/virtio/linux/export.h @@ -0,0 +1,5 @@ +#define EXPORT_SYMBOL(sym) +#define EXPORT_SYMBOL_GPL(sym) +#define EXPORT_SYMBOL_GPL_FUTURE(sym) +#define EXPORT_UNUSED_SYMBOL(sym) +#define EXPORT_UNUSED_SYMBOL_GPL(sym) diff --git a/tools/virtio/linux/irqreturn.h b/tools/virtio/linux/irqreturn.h new file mode 100644 index 000000000000..a3c4e7be7089 --- /dev/null +++ b/tools/virtio/linux/irqreturn.h @@ -0,0 +1 @@ +#include "../../../include/linux/irqreturn.h" diff --git a/tools/virtio/linux/kernel.h b/tools/virtio/linux/kernel.h new file mode 100644 index 000000000000..fba705963968 --- /dev/null +++ b/tools/virtio/linux/kernel.h @@ -0,0 +1,112 @@ +#ifndef KERNEL_H +#define KERNEL_H +#include <stdbool.h> +#include <stdlib.h> +#include <stddef.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include <stdarg.h> + +#include <linux/types.h> +#include <linux/printk.h> +#include <linux/bug.h> +#include <errno.h> +#include <unistd.h> +#include <asm/barrier.h> + +#define CONFIG_SMP + +#define PAGE_SIZE getpagesize() +#define PAGE_MASK (~(PAGE_SIZE-1)) + +typedef unsigned long long dma_addr_t; +typedef size_t __kernel_size_t; + +struct page { + unsigned long long dummy; +}; + +/* Physical == Virtual */ +#define virt_to_phys(p) ((unsigned long)p) +#define phys_to_virt(a) ((void *)(unsigned long)(a)) +/* Page address: Virtual / 4K */ +#define page_to_phys(p) ((dma_addr_t)(unsigned long)(p)) +#define virt_to_page(p) ((struct page *)((unsigned long)p & PAGE_MASK)) + +#define offset_in_page(p) (((unsigned long)p) % PAGE_SIZE) + +#define __printf(a,b) __attribute__((format(printf,a,b))) + +typedef enum { + GFP_KERNEL, + GFP_ATOMIC, + __GFP_HIGHMEM, + __GFP_HIGH +} gfp_t; + +#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) + +extern void *__kmalloc_fake, 
*__kfree_ignore_start, *__kfree_ignore_end; +static inline void *kmalloc(size_t s, gfp_t gfp) +{ + if (__kmalloc_fake) + return __kmalloc_fake; + return malloc(s); +} + +static inline void kfree(void *p) +{ + if (p >= __kfree_ignore_start && p < __kfree_ignore_end) + return; + free(p); +} + +static inline void *krealloc(void *p, size_t s, gfp_t gfp) +{ + return realloc(p, s); +} + + +static inline unsigned long __get_free_page(gfp_t gfp) +{ + void *p; + + posix_memalign(&p, PAGE_SIZE, PAGE_SIZE); + return (unsigned long)p; +} + +static inline void free_page(unsigned long addr) +{ + free((void *)addr); +} + +#define container_of(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + +#define uninitialized_var(x) x = x + +# ifndef likely +# define likely(x) (__builtin_expect(!!(x), 1)) +# endif +# ifndef unlikely +# define unlikely(x) (__builtin_expect(!!(x), 0)) +# endif + +#define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__) +#ifdef DEBUG +#define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__) +#else +#define pr_debug(format, ...) do {} while (0) +#endif +#define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) +#define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) + +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? 
_min1 : _min2; }) + +#endif /* KERNEL_H */ diff --git a/tools/virtio/linux/module.h b/tools/virtio/linux/module.h index e69de29bb2d1..3039a7e972b6 100644 --- a/tools/virtio/linux/module.h +++ b/tools/virtio/linux/module.h @@ -0,0 +1 @@ +#include <linux/export.h> diff --git a/tools/virtio/linux/printk.h b/tools/virtio/linux/printk.h new file mode 100644 index 000000000000..9f2423bd89c2 --- /dev/null +++ b/tools/virtio/linux/printk.h @@ -0,0 +1,4 @@ +#include "../../../include/linux/kern_levels.h" + +#define printk printf +#define vprintk vprintf diff --git a/tools/virtio/linux/ratelimit.h b/tools/virtio/linux/ratelimit.h new file mode 100644 index 000000000000..dcce1725f90d --- /dev/null +++ b/tools/virtio/linux/ratelimit.h @@ -0,0 +1,4 @@ +#define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) int name = 0 + +#define __ratelimit(x) (*(x)) + diff --git a/tools/virtio/linux/scatterlist.h b/tools/virtio/linux/scatterlist.h new file mode 100644 index 000000000000..68c9e2adc996 --- /dev/null +++ b/tools/virtio/linux/scatterlist.h @@ -0,0 +1,189 @@ +#ifndef SCATTERLIST_H +#define SCATTERLIST_H +#include <linux/kernel.h> + +struct scatterlist { + unsigned long page_link; + unsigned int offset; + unsigned int length; + dma_addr_t dma_address; +}; + +/* Scatterlist helpers, stolen from linux/scatterlist.h */ +#define sg_is_chain(sg) ((sg)->page_link & 0x01) +#define sg_is_last(sg) ((sg)->page_link & 0x02) +#define sg_chain_ptr(sg) \ + ((struct scatterlist *) ((sg)->page_link & ~0x03)) + +/** + * sg_assign_page - Assign a given page to an SG entry + * @sg: SG entry + * @page: The page + * + * Description: + * Assign page to sg entry. Also see sg_set_page(), the most commonly used + * variant. + * + **/ +static inline void sg_assign_page(struct scatterlist *sg, struct page *page) +{ + unsigned long page_link = sg->page_link & 0x3; + + /* + * In order for the low bit stealing approach to work, pages + * must be aligned at a 32-bit boundary as a minimum. 
+ */ + BUG_ON((unsigned long) page & 0x03); +#ifdef CONFIG_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); + BUG_ON(sg_is_chain(sg)); +#endif + sg->page_link = page_link | (unsigned long) page; +} + +/** + * sg_set_page - Set sg entry to point at given page + * @sg: SG entry + * @page: The page + * @len: Length of data + * @offset: Offset into page + * + * Description: + * Use this function to set an sg entry pointing at a page, never assign + * the page directly. We encode sg table information in the lower bits + * of the page pointer. See sg_page() for looking up the page belonging + * to an sg entry. + * + **/ +static inline void sg_set_page(struct scatterlist *sg, struct page *page, + unsigned int len, unsigned int offset) +{ + sg_assign_page(sg, page); + sg->offset = offset; + sg->length = len; +} + +static inline struct page *sg_page(struct scatterlist *sg) +{ +#ifdef CONFIG_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); + BUG_ON(sg_is_chain(sg)); +#endif + return (struct page *)((sg)->page_link & ~0x3); +} + +/* + * Loop over each sg element, following the pointer to a new list if necessary + */ +#define for_each_sg(sglist, sg, nr, __i) \ + for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg)) + +/** + * sg_chain - Chain two sglists together + * @prv: First scatterlist + * @prv_nents: Number of entries in prv + * @sgl: Second scatterlist + * + * Description: + * Links @prv@ and @sgl@ together, to form a longer scatterlist. + * + **/ +static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents, + struct scatterlist *sgl) +{ + /* + * offset and length are unused for chain entry. Clear them. + */ + prv[prv_nents - 1].offset = 0; + prv[prv_nents - 1].length = 0; + + /* + * Set lowest bit to indicate a link pointer, and make sure to clear + * the termination bit if it happens to be set. 
+ */ + prv[prv_nents - 1].page_link = ((unsigned long) sgl | 0x01) & ~0x02; +} + +/** + * sg_mark_end - Mark the end of the scatterlist + * @sg: SG entryScatterlist + * + * Description: + * Marks the passed in sg entry as the termination point for the sg + * table. A call to sg_next() on this entry will return NULL. + * + **/ +static inline void sg_mark_end(struct scatterlist *sg) +{ +#ifdef CONFIG_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); +#endif + /* + * Set termination bit, clear potential chain bit + */ + sg->page_link |= 0x02; + sg->page_link &= ~0x01; +} + +/** + * sg_unmark_end - Undo setting the end of the scatterlist + * @sg: SG entryScatterlist + * + * Description: + * Removes the termination marker from the given entry of the scatterlist. + * + **/ +static inline void sg_unmark_end(struct scatterlist *sg) +{ +#ifdef CONFIG_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); +#endif + sg->page_link &= ~0x02; +} + +static inline struct scatterlist *sg_next(struct scatterlist *sg) +{ +#ifdef CONFIG_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); +#endif + if (sg_is_last(sg)) + return NULL; + + sg++; + if (unlikely(sg_is_chain(sg))) + sg = sg_chain_ptr(sg); + + return sg; +} + +static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents) +{ + memset(sgl, 0, sizeof(*sgl) * nents); +#ifdef CONFIG_DEBUG_SG + { + unsigned int i; + for (i = 0; i < nents; i++) + sgl[i].sg_magic = SG_MAGIC; + } +#endif + sg_mark_end(&sgl[nents - 1]); +} + +static inline dma_addr_t sg_phys(struct scatterlist *sg) +{ + return page_to_phys(sg_page(sg)) + sg->offset; +} + +static inline void sg_set_buf(struct scatterlist *sg, const void *buf, + unsigned int buflen) +{ + sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); +} + +static inline void sg_init_one(struct scatterlist *sg, + const void *buf, unsigned int buflen) +{ + sg_init_table(sg, 1); + sg_set_buf(sg, buf, buflen); +} +#endif /* SCATTERLIST_H */ diff --git a/tools/virtio/linux/types.h 
b/tools/virtio/linux/types.h new file mode 100644 index 000000000000..f8ebb9a2b3d6 --- /dev/null +++ b/tools/virtio/linux/types.h @@ -0,0 +1,28 @@ +#ifndef TYPES_H +#define TYPES_H +#include <stdint.h> + +#define __force +#define __user +#define __must_check +#define __cold + +typedef uint64_t u64; +typedef int64_t s64; +typedef uint32_t u32; +typedef int32_t s32; +typedef uint16_t u16; +typedef int16_t s16; +typedef uint8_t u8; +typedef int8_t s8; + +typedef uint64_t __u64; +typedef int64_t __s64; +typedef uint32_t __u32; +typedef int32_t __s32; +typedef uint16_t __u16; +typedef int16_t __s16; +typedef uint8_t __u8; +typedef int8_t __s8; + +#endif /* TYPES_H */ diff --git a/tools/virtio/linux/uaccess.h b/tools/virtio/linux/uaccess.h new file mode 100644 index 000000000000..0a578fe18653 --- /dev/null +++ b/tools/virtio/linux/uaccess.h @@ -0,0 +1,50 @@ +#ifndef UACCESS_H +#define UACCESS_H +extern void *__user_addr_min, *__user_addr_max; /* bounds of the fake "user" region; must be defined by the program including this header */ + +#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) + +static inline void __chk_user_ptr(const volatile void *p, size_t size) +{ + assert(p >= __user_addr_min && p + size <= __user_addr_max); /* abort unless [p, p + size) lies inside the fake user region */ +} + +#define put_user(x, ptr) \ +({ \ + typeof(ptr) __pu_ptr = (ptr); \ + __chk_user_ptr(__pu_ptr, sizeof(*__pu_ptr)); \ + ACCESS_ONCE(*(__pu_ptr)) = (x); \ + 0; \ +}) + +#define get_user(x, ptr) \ +({ \ + typeof(ptr) __pu_ptr = (ptr); \ + __chk_user_ptr(__pu_ptr, sizeof(*__pu_ptr)); \ + (x) = ACCESS_ONCE(*(__pu_ptr)); \ + 0; \ +}) + +static inline void volatile_memcpy(volatile char *to, const volatile char *from, + unsigned long n) +{ + while (n--) /* byte-by-byte copy through volatile pointers */ + *(to++) = *(from++); +} + +static inline int copy_from_user(void *to, const void __user volatile *from, + unsigned long n) +{ + __chk_user_ptr(from, n); + volatile_memcpy(to, from, n); + return 0; /* always reports success; a bad range aborts in __chk_user_ptr() instead */ +} + +static inline int copy_to_user(void __user volatile *to, const void *from, + unsigned long n) +{ + __chk_user_ptr(to, n); + volatile_memcpy(to, from, n); + return 0; /* always reports success; a bad range aborts in __chk_user_ptr() instead */ +} +#endif /* UACCESS_H */ diff --git 
a/tools/virtio/linux/uio.h b/tools/virtio/linux/uio.h new file mode 100644 index 000000000000..cd20f0ba3081 --- /dev/null +++ b/tools/virtio/linux/uio.h @@ -0,0 +1,3 @@ +#include <linux/kernel.h> + +#include "../../../include/linux/uio.h" diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h index 81847dd08bd0..cd801838156f 100644 --- a/tools/virtio/linux/virtio.h +++ b/tools/virtio/linux/virtio.h @@ -1,127 +1,7 @@ #ifndef LINUX_VIRTIO_H #define LINUX_VIRTIO_H - -#include <stdbool.h> -#include <stdlib.h> -#include <stddef.h> -#include <stdio.h> -#include <string.h> -#include <assert.h> - -#include <linux/types.h> -#include <errno.h> - -typedef unsigned long long dma_addr_t; - -struct scatterlist { - unsigned long page_link; - unsigned int offset; - unsigned int length; - dma_addr_t dma_address; -}; - -struct page { - unsigned long long dummy; -}; - -#define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond)) - -/* Physical == Virtual */ -#define virt_to_phys(p) ((unsigned long)p) -#define phys_to_virt(a) ((void *)(unsigned long)(a)) -/* Page address: Virtual / 4K */ -#define virt_to_page(p) ((struct page*)((virt_to_phys(p) / 4096) * \ - sizeof(struct page))) -#define offset_in_page(p) (((unsigned long)p) % 4096) -#define sg_phys(sg) ((sg->page_link & ~0x3) / sizeof(struct page) * 4096 + \ - sg->offset) -static inline void sg_mark_end(struct scatterlist *sg) -{ - /* - * Set termination bit, clear potential chain bit - */ - sg->page_link |= 0x02; - sg->page_link &= ~0x01; -} -static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents) -{ - memset(sgl, 0, sizeof(*sgl) * nents); - sg_mark_end(&sgl[nents - 1]); -} -static inline void sg_assign_page(struct scatterlist *sg, struct page *page) -{ - unsigned long page_link = sg->page_link & 0x3; - - /* - * In order for the low bit stealing approach to work, pages - * must be aligned at a 32-bit boundary as a minimum. 
- */ - BUG_ON((unsigned long) page & 0x03); - sg->page_link = page_link | (unsigned long) page; -} - -static inline void sg_set_page(struct scatterlist *sg, struct page *page, - unsigned int len, unsigned int offset) -{ - sg_assign_page(sg, page); - sg->offset = offset; - sg->length = len; -} - -static inline void sg_set_buf(struct scatterlist *sg, const void *buf, - unsigned int buflen) -{ - sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); -} - -static inline void sg_init_one(struct scatterlist *sg, const void *buf, unsigned int buflen) -{ - sg_init_table(sg, 1); - sg_set_buf(sg, buf, buflen); -} - -typedef __u16 u16; - -typedef enum { - GFP_KERNEL, - GFP_ATOMIC, -} gfp_t; -typedef enum { - IRQ_NONE, - IRQ_HANDLED -} irqreturn_t; - -static inline void *kmalloc(size_t s, gfp_t gfp) -{ - return malloc(s); -} - -static inline void kfree(void *p) -{ - free(p); -} - -#define container_of(ptr, type, member) ({ \ - const typeof( ((type *)0)->member ) *__mptr = (ptr); \ - (type *)( (char *)__mptr - offsetof(type,member) );}) - -#define uninitialized_var(x) x = x - -# ifndef likely -# define likely(x) (__builtin_expect(!!(x), 1)) -# endif -# ifndef unlikely -# define unlikely(x) (__builtin_expect(!!(x), 0)) -# endif - -#define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__) -#ifdef DEBUG -#define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__) -#else -#define pr_debug(format, ...) do {} while (0) -#endif -#define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) -#define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) +#include <linux/scatterlist.h> +#include <linux/kernel.h> /* TODO: empty stubs for now. 
Broken but enough for virtio_ring.c */ #define list_add_tail(a, b) do {} while (0) @@ -131,6 +11,7 @@ static inline void kfree(void *p) #define BITS_PER_BYTE 8 #define BITS_PER_LONG (sizeof(long) * BITS_PER_BYTE) #define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) + /* TODO: Not atomic as it should be: * we don't use this for anything important. */ static inline void clear_bit(int nr, volatile unsigned long *addr) @@ -145,10 +26,6 @@ static inline int test_bit(int nr, const volatile unsigned long *addr) { return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); } - -/* The only feature we care to support */ -#define virtio_has_feature(dev, feature) \ - test_bit((feature), (dev)->features) /* end of stubs */ struct virtio_device { @@ -163,39 +40,32 @@ struct virtqueue { void (*callback)(struct virtqueue *vq); const char *name; struct virtio_device *vdev; + unsigned int index; + unsigned int num_free; void *priv; }; -#define EXPORT_SYMBOL_GPL(__EXPORT_SYMBOL_GPL_name) \ - void __EXPORT_SYMBOL_GPL##__EXPORT_SYMBOL_GPL_name() { \ -} #define MODULE_LICENSE(__MODULE_LICENSE_value) \ const char *__MODULE_LICENSE_name = __MODULE_LICENSE_value -#define CONFIG_SMP - -#if defined(__i386__) || defined(__x86_64__) -#define barrier() asm volatile("" ::: "memory") -#define mb() __sync_synchronize() - -#define smp_mb() mb() -# define smp_rmb() barrier() -# define smp_wmb() barrier() -/* Weak barriers should be used. If not - it's a bug */ -# define rmb() abort() -# define wmb() abort() -#else -#error Please fill in barrier macros -#endif - /* Interfaces exported by virtio_ring. 
*/ -int virtqueue_add_buf(struct virtqueue *vq, - struct scatterlist sg[], - unsigned int out_num, - unsigned int in_num, +int virtqueue_add_sgs(struct virtqueue *vq, + struct scatterlist *sgs[], + unsigned int out_sgs, + unsigned int in_sgs, void *data, gfp_t gfp); +int virtqueue_add_outbuf(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + gfp_t gfp); + +int virtqueue_add_inbuf(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + gfp_t gfp); + void virtqueue_kick(struct virtqueue *vq); void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); @@ -206,7 +76,8 @@ bool virtqueue_enable_cb(struct virtqueue *vq); bool virtqueue_enable_cb_delayed(struct virtqueue *vq); void *virtqueue_detach_unused_buf(struct virtqueue *vq); -struct virtqueue *vring_new_virtqueue(unsigned int num, +struct virtqueue *vring_new_virtqueue(unsigned int index, + unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, diff --git a/tools/virtio/linux/virtio_config.h b/tools/virtio/linux/virtio_config.h new file mode 100644 index 000000000000..5049967f99f7 --- /dev/null +++ b/tools/virtio/linux/virtio_config.h @@ -0,0 +1,6 @@ +#define VIRTIO_TRANSPORT_F_START 28 +#define VIRTIO_TRANSPORT_F_END 32 + +#define virtio_has_feature(dev, feature) \ + test_bit((feature), (dev)->features) + diff --git a/tools/virtio/linux/virtio_ring.h b/tools/virtio/linux/virtio_ring.h new file mode 100644 index 000000000000..8949c4e2772c --- /dev/null +++ b/tools/virtio/linux/virtio_ring.h @@ -0,0 +1 @@ +#include "../../../include/linux/virtio_ring.h" diff --git a/tools/virtio/linux/vringh.h b/tools/virtio/linux/vringh.h new file mode 100644 index 000000000000..9348957be56e --- /dev/null +++ b/tools/virtio/linux/vringh.h @@ -0,0 +1 @@ +#include "../../../include/linux/vringh.h" diff --git a/tools/virtio/uapi/linux/uio.h b/tools/virtio/uapi/linux/uio.h new file mode 100644 index 000000000000..7230e9002207 --- 
/dev/null +++ b/tools/virtio/uapi/linux/uio.h @@ -0,0 +1 @@ +#include <sys/uio.h> diff --git a/tools/virtio/uapi/linux/virtio_config.h b/tools/virtio/uapi/linux/virtio_config.h new file mode 100644 index 000000000000..4c86675f0159 --- /dev/null +++ b/tools/virtio/uapi/linux/virtio_config.h @@ -0,0 +1 @@ +#include "../../../../include/uapi/linux/virtio_config.h" diff --git a/tools/virtio/uapi/linux/virtio_ring.h b/tools/virtio/uapi/linux/virtio_ring.h new file mode 100644 index 000000000000..4d99c78234d3 --- /dev/null +++ b/tools/virtio/uapi/linux/virtio_ring.h @@ -0,0 +1,4 @@ +#ifndef VIRTIO_RING_H +#define VIRTIO_RING_H +#include "../../../../include/uapi/linux/virtio_ring.h" +#endif /* VIRTIO_RING_H */ diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c index fcc9aa25fd08..da7a19558281 100644 --- a/tools/virtio/virtio_test.c +++ b/tools/virtio/virtio_test.c @@ -10,11 +10,15 @@ #include <sys/stat.h> #include <sys/types.h> #include <fcntl.h> +#include <stdbool.h> #include <linux/vhost.h> #include <linux/virtio.h> #include <linux/virtio_ring.h> #include "../../drivers/vhost/test.h" +/* Unused */ +void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end; + struct vq_info { int kick; int call; @@ -92,7 +96,8 @@ static void vq_info_add(struct vdev_info *dev, int num) assert(r >= 0); memset(info->ring, 0, vring_size(num, 4096)); vring_init(&info->vring, num, info->ring, 4096); - info->vq = vring_new_virtqueue(info->vring.num, 4096, &dev->vdev, + info->vq = vring_new_virtqueue(info->idx, + info->vring.num, 4096, &dev->vdev, true, info->ring, vq_notify, vq_callback, "test"); assert(info->vq); @@ -161,9 +166,9 @@ static void run_test(struct vdev_info *dev, struct vq_info *vq, do { if (started < bufs) { sg_init_one(&sl, dev->buf, dev->buf_size); - r = virtqueue_add_buf(vq->vq, &sl, 1, 0, - dev->buf + started, - GFP_ATOMIC); + r = virtqueue_add_outbuf(vq->vq, &sl, 1, + dev->buf + started, + GFP_ATOMIC); if (likely(r == 0)) { ++started; 
virtqueue_kick(vq->vq); diff --git a/tools/virtio/vringh_test.c b/tools/virtio/vringh_test.c new file mode 100644 index 000000000000..d053ea40c001 --- /dev/null +++ b/tools/virtio/vringh_test.c @@ -0,0 +1,741 @@ +/* Simple test of virtio code, entirely in userspace. */ +#define _GNU_SOURCE +#include <sched.h> +#include <err.h> +#include <linux/kernel.h> +#include <linux/err.h> +#include <linux/virtio.h> +#include <linux/vringh.h> +#include <linux/virtio_ring.h> +#include <linux/uaccess.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <sys/wait.h> +#include <fcntl.h> + +#define USER_MEM (1024*1024) +void *__user_addr_min, *__user_addr_max; +void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end; +static u64 user_addr_offset; /* host pointer minus guest-visible address; stays 0 unless parallel_test() sets it */ + +#define RINGSIZE 256 +#define ALIGN 4096 + +static void never_notify_host(struct virtqueue *vq) +{ + abort(); +} + +static void never_callback_guest(struct virtqueue *vq) +{ + abort(); +} + +static bool getrange_iov(struct vringh *vrh, u64 addr, struct vringh_range *r) +{ + if (addr < (u64)(unsigned long)__user_addr_min - user_addr_offset) + return false; + if (addr >= (u64)(unsigned long)__user_addr_max - user_addr_offset) + return false; + + r->start = (u64)(unsigned long)__user_addr_min - user_addr_offset; /* report the whole user region as a single range */ + r->end_incl = (u64)(unsigned long)__user_addr_max - 1 - user_addr_offset; + r->offset = user_addr_offset; + return true; +} + +/* We return single byte ranges. 
*/ +static bool getrange_slow(struct vringh *vrh, u64 addr, struct vringh_range *r) +{ + if (addr < (u64)(unsigned long)__user_addr_min - user_addr_offset) + return false; + if (addr >= (u64)(unsigned long)__user_addr_max - user_addr_offset) + return false; + + r->start = addr; + r->end_incl = r->start; + r->offset = user_addr_offset; + return true; +} + +struct guest_virtio_device { + struct virtio_device vdev; + int to_host_fd; + unsigned long notifies; +}; + +static void parallel_notify_host(struct virtqueue *vq) +{ + struct guest_virtio_device *gvdev; + + gvdev = container_of(vq->vdev, struct guest_virtio_device, vdev); + write(gvdev->to_host_fd, "", 1); /* one-byte wakeup for the host; the return value is not checked */ + gvdev->notifies++; +} + +static void no_notify_host(struct virtqueue *vq) +{ +} + +#define NUM_XFERS (10000000) + +/* We aim for two "distant" cpus: probe ids 0..4095 by trying to pin this process to each one; *first and *last receive the lowest and highest id that succeeded. Every successful probe really changes the caller's affinity. */ +static void find_cpus(unsigned int *first, unsigned int *last) +{ + unsigned int i; + + *first = -1U; + *last = 0; + for (i = 0; i < 4096; i++) { + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(i, &set); + if (sched_setaffinity(getpid(), sizeof(set), &set) == 0) { + if (i < *first) + *first = i; + if (i > *last) + *last = i; + } + } +} + +/* Opencoded version for fast mode: reads the next head index directly from the avail ring. */ +static inline int vringh_get_head(struct vringh *vrh, u16 *head) +{ + u16 avail_idx, i; + int err; + + err = get_user(avail_idx, &vrh->vring.avail->idx); + if (err) + return err; + + if (vrh->last_avail_idx == avail_idx) + return 0; /* nothing new has been published */ + + /* Only get avail ring entries after they have been exposed by guest. 
*/ + virtio_rmb(vrh->weak_barriers); + + i = vrh->last_avail_idx & (vrh->vring.num - 1); + + err = get_user(*head, &vrh->vring.avail->ring[i]); + if (err) + return err; + + vrh->last_avail_idx++; + return 1; +} + +static int parallel_test(unsigned long features, + bool (*getrange)(struct vringh *vrh, + u64 addr, struct vringh_range *r), + bool fast_vringh) +{ + void *host_map, *guest_map; + int fd, mapsize, to_guest[2], to_host[2]; + unsigned long xfers = 0, notifies = 0, receives = 0; + unsigned int first_cpu, last_cpu; + cpu_set_t cpu_set; + char buf[128]; + + /* Create real file to mmap. */ + fd = open("/tmp/vringh_test-file", O_RDWR|O_CREAT|O_TRUNC, 0600); + if (fd < 0) + err(1, "Opening /tmp/vringh_test-file"); + + /* Extra room at the end for some data, and indirects */ + mapsize = vring_size(RINGSIZE, ALIGN) + + RINGSIZE * 2 * sizeof(int) + + RINGSIZE * 6 * sizeof(struct vring_desc); + mapsize = (mapsize + getpagesize() - 1) & ~(getpagesize() - 1); + ftruncate(fd, mapsize); + + /* Parent and child use separate addresses, to check our mapping logic! */ + host_map = mmap(NULL, mapsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + guest_map = mmap(NULL, mapsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + + pipe(to_guest); + pipe(to_host); + + CPU_ZERO(&cpu_set); + find_cpus(&first_cpu, &last_cpu); + printf("Using CPUS %u and %u\n", first_cpu, last_cpu); + fflush(stdout); + + if (fork() != 0) { + struct vringh vrh; + int status, err, rlen = 0; + char rbuf[5]; + + /* We are the host: never access guest addresses! 
*/ + munmap(guest_map, mapsize); + + __user_addr_min = host_map; + __user_addr_max = __user_addr_min + mapsize; + user_addr_offset = host_map - guest_map; + assert(user_addr_offset); + + close(to_guest[0]); + close(to_host[1]); + + vring_init(&vrh.vring, RINGSIZE, host_map, ALIGN); + vringh_init_user(&vrh, features, RINGSIZE, true, + vrh.vring.desc, vrh.vring.avail, vrh.vring.used); + CPU_SET(first_cpu, &cpu_set); + if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set)) + errx(1, "Could not set affinity to cpu %u", first_cpu); + + while (xfers < NUM_XFERS) { + struct iovec host_riov[2], host_wiov[2]; + struct vringh_iov riov, wiov; + u16 head, written; + + if (fast_vringh) { + for (;;) { + err = vringh_get_head(&vrh, &head); + if (err != 0) + break; + err = vringh_need_notify_user(&vrh); + if (err < 0) + errx(1, "vringh_need_notify_user: %i", + err); + if (err) { + write(to_guest[1], "", 1); + notifies++; + } + } + if (err != 1) + errx(1, "vringh_get_head"); + written = 0; + goto complete; + } else { + vringh_iov_init(&riov, + host_riov, + ARRAY_SIZE(host_riov)); + vringh_iov_init(&wiov, + host_wiov, + ARRAY_SIZE(host_wiov)); + + err = vringh_getdesc_user(&vrh, &riov, &wiov, + getrange, &head); + } + if (err == 0) { + err = vringh_need_notify_user(&vrh); + if (err < 0) + errx(1, "vringh_need_notify_user: %i", + err); + if (err) { + write(to_guest[1], "", 1); + notifies++; + } + + if (!vringh_notify_enable_user(&vrh)) + continue; + + /* Swallow all notifies at once. */ + if (read(to_host[0], buf, sizeof(buf)) < 1) + break; + + vringh_notify_disable_user(&vrh); + receives++; + continue; + } + if (err != 1) + errx(1, "vringh_getdesc_user: %i", err); + + /* We simply copy bytes. 
*/ + if (riov.used) { + rlen = vringh_iov_pull_user(&riov, rbuf, + sizeof(rbuf)); + if (rlen != 4) + errx(1, "vringh_iov_pull_user: %i", + rlen); + assert(riov.i == riov.used); + written = 0; + } else { + err = vringh_iov_push_user(&wiov, rbuf, rlen); + if (err != rlen) + errx(1, "vringh_iov_push_user: %i", + err); + assert(wiov.i == wiov.used); + written = err; + } + complete: + xfers++; + + err = vringh_complete_user(&vrh, head, written); + if (err != 0) + errx(1, "vringh_complete_user: %i", err); + } + + err = vringh_need_notify_user(&vrh); + if (err < 0) + errx(1, "vringh_need_notify_user: %i", err); + if (err) { + write(to_guest[1], "", 1); + notifies++; + } + wait(&status); + if (!WIFEXITED(status)) + errx(1, "Child died with signal %i?", WTERMSIG(status)); + if (WEXITSTATUS(status) != 0) + errx(1, "Child exited %i?", WEXITSTATUS(status)); + printf("Host: notified %lu, pinged %lu\n", notifies, receives); + return 0; + } else { + struct guest_virtio_device gvdev; + struct virtqueue *vq; + unsigned int *data; + struct vring_desc *indirects; + unsigned int finished = 0; + + /* We pass sg[]s pointing into here, but we need RINGSIZE+1 */ + data = guest_map + vring_size(RINGSIZE, ALIGN); + indirects = (void *)data + (RINGSIZE + 1) * 2 * sizeof(int); + + /* We are the guest. */ + munmap(host_map, mapsize); + + close(to_guest[1]); + close(to_host[0]); + + gvdev.vdev.features[0] = features; + gvdev.to_host_fd = to_host[1]; + gvdev.notifies = 0; + + CPU_SET(last_cpu, &cpu_set); /* the host pinned itself to first_cpu; the guest takes the other, "distant" CPU found by find_cpus() */ + if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set)) + err(1, "Could not set affinity to cpu %u", last_cpu); + + vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &gvdev.vdev, true, + guest_map, fast_vringh ? no_notify_host + : parallel_notify_host, + never_callback_guest, "guest vq"); + + /* Don't kfree indirects. 
*/ + __kfree_ignore_start = indirects; + __kfree_ignore_end = indirects + RINGSIZE * 6; + + while (xfers < NUM_XFERS) { + struct scatterlist sg[4]; + unsigned int num_sg, len; + int *dbuf, err; + bool output = !(xfers % 2); + + /* Consume bufs. */ + while ((dbuf = virtqueue_get_buf(vq, &len)) != NULL) { + if (len == 4) + assert(*dbuf == finished - 1); + else if (!fast_vringh) + assert(*dbuf == finished); + finished++; + } + + /* Produce a buffer. */ + dbuf = data + (xfers % (RINGSIZE + 1)); + + if (output) + *dbuf = xfers; + else + *dbuf = -1; + + switch ((xfers / sizeof(*dbuf)) % 4) { + case 0: + /* Nasty three-element sg list. */ + sg_init_table(sg, num_sg = 3); + sg_set_buf(&sg[0], (void *)dbuf, 1); + sg_set_buf(&sg[1], (void *)dbuf + 1, 2); + sg_set_buf(&sg[2], (void *)dbuf + 3, 1); + break; + case 1: + sg_init_table(sg, num_sg = 2); + sg_set_buf(&sg[0], (void *)dbuf, 1); + sg_set_buf(&sg[1], (void *)dbuf + 1, 3); + break; + case 2: + sg_init_table(sg, num_sg = 1); + sg_set_buf(&sg[0], (void *)dbuf, 4); + break; + case 3: + sg_init_table(sg, num_sg = 4); + sg_set_buf(&sg[0], (void *)dbuf, 1); + sg_set_buf(&sg[1], (void *)dbuf + 1, 1); + sg_set_buf(&sg[2], (void *)dbuf + 2, 1); + sg_set_buf(&sg[3], (void *)dbuf + 3, 1); + break; + } + + /* May allocate an indirect, so force it to allocate + * user addr */ + __kmalloc_fake = indirects + (xfers % RINGSIZE) * 4; + if (output) + err = virtqueue_add_outbuf(vq, sg, num_sg, dbuf, + GFP_KERNEL); + else + err = virtqueue_add_inbuf(vq, sg, num_sg, + dbuf, GFP_KERNEL); + + if (err == -ENOSPC) { + if (!virtqueue_enable_cb_delayed(vq)) + continue; + /* Swallow all notifies at once. */ + if (read(to_guest[0], buf, sizeof(buf)) < 1) + break; + + receives++; + virtqueue_disable_cb(vq); + continue; + } + + if (err) + errx(1, "virtqueue_add_in/outbuf: %i", err); + + xfers++; + virtqueue_kick(vq); + } + + /* Any extra? */ + while (finished != xfers) { + int *dbuf; + unsigned int len; + + /* Consume bufs. 
*/ + dbuf = virtqueue_get_buf(vq, &len); + if (dbuf) { + if (len == 4) + assert(*dbuf == finished - 1); + else + assert(len == 0); + finished++; + continue; + } + + if (!virtqueue_enable_cb_delayed(vq)) + continue; + if (read(to_guest[0], buf, sizeof(buf)) < 1) + break; + + receives++; + virtqueue_disable_cb(vq); + } + + printf("Guest: notified %lu, pinged %lu\n", + gvdev.notifies, receives); + vring_del_virtqueue(vq); + return 0; + } +} + +int main(int argc, char *argv[]) +{ + struct virtio_device vdev; + struct virtqueue *vq; + struct vringh vrh; + struct scatterlist guest_sg[RINGSIZE], *sgs[2]; + struct iovec host_riov[2], host_wiov[2]; + struct vringh_iov riov, wiov; + struct vring_used_elem used[RINGSIZE]; + char buf[28]; + u16 head; + int err; + unsigned i; + void *ret; + bool (*getrange)(struct vringh *vrh, u64 addr, struct vringh_range *r); + bool fast_vringh = false, parallel = false; + + getrange = getrange_iov; + vdev.features[0] = 0; + + while (argv[1]) { + if (strcmp(argv[1], "--indirect") == 0) + vdev.features[0] |= (1 << VIRTIO_RING_F_INDIRECT_DESC); + else if (strcmp(argv[1], "--eventidx") == 0) + vdev.features[0] |= (1 << VIRTIO_RING_F_EVENT_IDX); + else if (strcmp(argv[1], "--slow-range") == 0) + getrange = getrange_slow; + else if (strcmp(argv[1], "--fast-vringh") == 0) + fast_vringh = true; + else if (strcmp(argv[1], "--parallel") == 0) + parallel = true; + else + errx(1, "Unknown arg %s", argv[1]); + argv++; + } + + if (parallel) + return parallel_test(vdev.features[0], getrange, fast_vringh); + + if (posix_memalign(&__user_addr_min, PAGE_SIZE, USER_MEM) != 0) + abort(); + __user_addr_max = __user_addr_min + USER_MEM; + memset(__user_addr_min, 0, vring_size(RINGSIZE, ALIGN)); + + /* Set up guest side. */ + vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &vdev, true, + __user_addr_min, + never_notify_host, never_callback_guest, + "guest vq"); + + /* Set up host side. 
*/ + vring_init(&vrh.vring, RINGSIZE, __user_addr_min, ALIGN); + vringh_init_user(&vrh, vdev.features[0], RINGSIZE, true, + vrh.vring.desc, vrh.vring.avail, vrh.vring.used); + + /* No descriptor to get yet... */ + err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); + if (err != 0) + errx(1, "vringh_getdesc_user: %i", err); + + /* Guest puts in a descriptor: a 1-byte out sg followed by a 2-byte in sg. */ + memcpy(__user_addr_max - 1, "a", 1); + sg_init_table(guest_sg, 1); + sg_set_buf(&guest_sg[0], __user_addr_max - 1, 1); + sg_init_table(guest_sg+1, 1); + sg_set_buf(&guest_sg[1], __user_addr_max - 3, 2); + sgs[0] = &guest_sg[0]; + sgs[1] = &guest_sg[1]; + + /* May allocate an indirect, so force it to allocate user addr */ + __kmalloc_fake = __user_addr_min + vring_size(RINGSIZE, ALIGN); + err = virtqueue_add_sgs(vq, sgs, 1, 1, &err, GFP_KERNEL); + if (err) + errx(1, "virtqueue_add_sgs: %i", err); + __kmalloc_fake = NULL; + + /* Host retrieves it. */ + vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov)); + vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov)); + + err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); + if (err != 1) + errx(1, "vringh_getdesc_user: %i", err); + + assert(riov.used == 1); + assert(riov.iov[0].iov_base == __user_addr_max - 1); + assert(riov.iov[0].iov_len == 1); + if (getrange != getrange_slow) { + assert(wiov.used == 1); + assert(wiov.iov[0].iov_base == __user_addr_max - 3); + assert(wiov.iov[0].iov_len == 2); + } else { + assert(wiov.used == 2); + assert(wiov.iov[0].iov_base == __user_addr_max - 3); + assert(wiov.iov[0].iov_len == 1); + assert(wiov.iov[1].iov_base == __user_addr_max - 2); + assert(wiov.iov[1].iov_len == 1); + } + + err = vringh_iov_pull_user(&riov, buf, 5); + if (err != 1) + errx(1, "vringh_iov_pull_user: %i", err); + assert(buf[0] == 'a'); + assert(riov.i == 1); + assert(vringh_iov_pull_user(&riov, buf, 5) == 0); + + memcpy(buf, "bcdef", 5); + err = vringh_iov_push_user(&wiov, buf, 5); + if (err != 2) + errx(1, 
"vringh_iov_push_user: %i", err); + assert(memcmp(__user_addr_max - 3, "bc", 2) == 0); + assert(wiov.i == wiov.used); + assert(vringh_iov_push_user(&wiov, buf, 5) == 0); + + /* Host is done. */ + err = vringh_complete_user(&vrh, head, err); + if (err != 0) + errx(1, "vringh_complete_user: %i", err); + + /* Guest should see used token now. */ + __kfree_ignore_start = __user_addr_min + vring_size(RINGSIZE, ALIGN); + __kfree_ignore_end = __kfree_ignore_start + 1; + ret = virtqueue_get_buf(vq, &i); + if (ret != &err) + errx(1, "virtqueue_get_buf: %p", ret); + assert(i == 2); + + /* Guest puts in a huge descriptor. */ + sg_init_table(guest_sg, RINGSIZE); + for (i = 0; i < RINGSIZE; i++) { + sg_set_buf(&guest_sg[i], + __user_addr_max - USER_MEM/4, USER_MEM/4); + } + + /* Fill contents with recognisable garbage. */ + for (i = 0; i < USER_MEM/4; i++) + ((char *)__user_addr_max - USER_MEM/4)[i] = i; + + /* This will allocate an indirect, so force it to allocate user addr */ + __kmalloc_fake = __user_addr_min + vring_size(RINGSIZE, ALIGN); + err = virtqueue_add_outbuf(vq, guest_sg, RINGSIZE, &err, GFP_KERNEL); + if (err) + errx(1, "virtqueue_add_outbuf (large): %i", err); + __kmalloc_fake = NULL; + + /* Host picks it up (allocates new iov). */ + vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov)); + vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov)); + + err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); + if (err != 1) + errx(1, "vringh_getdesc_user: %i", err); + + assert(riov.max_num & VRINGH_IOV_ALLOCATED); + assert(riov.iov != host_riov); + if (getrange != getrange_slow) + assert(riov.used == RINGSIZE); + else + assert(riov.used == RINGSIZE * USER_MEM/4); + + assert(!(wiov.max_num & VRINGH_IOV_ALLOCATED)); + assert(wiov.used == 0); + + /* Pull data back out (in odd chunks), should be as expected. 
*/ + for (i = 0; i < RINGSIZE * USER_MEM/4; i += 3) { + err = vringh_iov_pull_user(&riov, buf, 3); + if (err != 3 && i + err != RINGSIZE * USER_MEM/4) + errx(1, "vringh_iov_pull_user large: %i", err); + assert(buf[0] == (char)i); + assert(err < 2 || buf[1] == (char)(i + 1)); + assert(err < 3 || buf[2] == (char)(i + 2)); + } + assert(riov.i == riov.used); + vringh_iov_cleanup(&riov); + vringh_iov_cleanup(&wiov); + + /* Complete using multi interface, just because we can. */ + used[0].id = head; + used[0].len = 0; + err = vringh_complete_multi_user(&vrh, used, 1); + if (err) + errx(1, "vringh_complete_multi_user(1): %i", err); + + /* Free up those descriptors. */ + ret = virtqueue_get_buf(vq, &i); + if (ret != &err) + errx(1, "virtqueue_get_buf: %p", ret); + + /* Add lots of descriptors. */ + sg_init_table(guest_sg, 1); + sg_set_buf(&guest_sg[0], __user_addr_max - 1, 1); + for (i = 0; i < RINGSIZE; i++) { + err = virtqueue_add_outbuf(vq, guest_sg, 1, &err, GFP_KERNEL); + if (err) + errx(1, "virtqueue_add_outbuf (multiple): %i", err); + } + + /* Now get many, and consume them all at once. */ + vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov)); + vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov)); + + for (i = 0; i < RINGSIZE; i++) { + err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); + if (err != 1) + errx(1, "vringh_getdesc_user: %i", err); + used[i].id = head; + used[i].len = 0; + } + /* Make sure it wraps around ring, to test! */ + assert(vrh.vring.used->idx % RINGSIZE != 0); + err = vringh_complete_multi_user(&vrh, used, RINGSIZE); + if (err) + errx(1, "vringh_complete_multi_user: %i", err); + + /* Free those buffers. */ + for (i = 0; i < RINGSIZE; i++) { + unsigned len; + assert(virtqueue_get_buf(vq, &len) != NULL); + } + + /* Test weird (but legal!) indirect. 
*/ + if (vdev.features[0] & (1 << VIRTIO_RING_F_INDIRECT_DESC)) { + char *data = __user_addr_max - USER_MEM/4; + struct vring_desc *d = __user_addr_max - USER_MEM/2; + struct vring vring; + + /* Force creation of direct, which we modify. */ + vdev.features[0] &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC); + vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &vdev, true, + __user_addr_min, + never_notify_host, + never_callback_guest, + "guest vq"); + + sg_init_table(guest_sg, 4); + sg_set_buf(&guest_sg[0], d, sizeof(*d)*2); + sg_set_buf(&guest_sg[1], d + 2, sizeof(*d)*1); + sg_set_buf(&guest_sg[2], data + 6, 4); + sg_set_buf(&guest_sg[3], d + 3, sizeof(*d)*3); + + err = virtqueue_add_outbuf(vq, guest_sg, 4, &err, GFP_KERNEL); + if (err) + errx(1, "virtqueue_add_outbuf (indirect): %i", err); + + vring_init(&vring, RINGSIZE, __user_addr_min, ALIGN); + + /* They're used in order, but double-check... */ + assert(vring.desc[0].addr == (unsigned long)d); + assert(vring.desc[1].addr == (unsigned long)(d+2)); + assert(vring.desc[2].addr == (unsigned long)data + 6); + assert(vring.desc[3].addr == (unsigned long)(d+3)); + vring.desc[0].flags |= VRING_DESC_F_INDIRECT; + vring.desc[1].flags |= VRING_DESC_F_INDIRECT; + vring.desc[3].flags |= VRING_DESC_F_INDIRECT; + + /* First indirect */ + d[0].addr = (unsigned long)data; + d[0].len = 1; + d[0].flags = VRING_DESC_F_NEXT; + d[0].next = 1; + d[1].addr = (unsigned long)data + 1; + d[1].len = 2; + d[1].flags = 0; + + /* Second indirect */ + d[2].addr = (unsigned long)data + 3; + d[2].len = 3; + d[2].flags = 0; + + /* Third indirect */ + d[3].addr = (unsigned long)data + 10; + d[3].len = 5; + d[3].flags = VRING_DESC_F_NEXT; + d[3].next = 1; + d[4].addr = (unsigned long)data + 15; + d[4].len = 6; + d[4].flags = VRING_DESC_F_NEXT; + d[4].next = 2; + d[5].addr = (unsigned long)data + 21; + d[5].len = 7; + d[5].flags = 0; + + /* Host picks it up (allocates new iov). 
*/ + vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov)); + vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov)); + + err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); + if (err != 1) + errx(1, "vringh_getdesc_user: %i", err); + + if (head != 0) + errx(1, "vringh_getdesc_user: head %i not 0", head); + + assert(riov.max_num & VRINGH_IOV_ALLOCATED); + if (getrange != getrange_slow) + assert(riov.used == 7); + else + assert(riov.used == 28); + err = vringh_iov_pull_user(&riov, buf, 29); + assert(err == 28); + + /* Data should be linear. */ + for (i = 0; i < err; i++) + assert(buf[i] == i); + vringh_iov_cleanup(&riov); + } + + /* Don't leak memory... */ + vring_del_virtqueue(vq); + free(__user_addr_min); + + return 0; +} |