tools/testing/selftests/bpf/prog_tests/lwt_reroute.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263

// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause

/*
 * Test suite of lwt BPF programs that reroutes packets
 *   The file tests focus not only if these programs work as expected normally,
 *   but also if they can handle abnormal situations gracefully. This test
 *   suite currently only covers lwt_xmit hook. lwt_in tests have not been
 *   implemented.
 *
 * WARNING
 * -------
 *  This test suite can crash the kernel, thus should be run in a VM.
 *
 * Setup:
 * ---------
 *  all tests are performed in a single netns. A lwt encap route is setup for
 *  each subtest:
 *
 *    ip route add 10.0.0.0/24 encap bpf xmit <obj> sec "<section_N>" dev link_err
 *
 *  Here <obj> is statically defined to test_lwt_reroute.bpf.o, and it contains
 *  a single test program entry. This program sets packet mark by last byte of
 *  the IPv4 daddr. For example, a packet going to 1.2.3.4 will receive a skb
 *  mark 4. A packet will only be marked once, and IP x.x.x.0 will be skipped
 *  to avoid route loop. We didn't use generated BPF skeleton since the
 *  attachment for lwt programs are not supported by libbpf yet.
 *
 *  The test program will bring up a tun device, and sets up the following
 *  routes:
 *
 *    ip rule add pref 100 from all fwmark <tun_index> lookup 100
 *    ip route add table 100 default dev tun0
 *
 *  For normal testing, a ping command is running in the test netns:
 *
 *    ping 10.0.0.<tun_index> -c 1 -w 1 -s 100
 *
 *  For abnormal testing, fq is used as the qdisc of the tun device. Then a UDP
 *  socket will try to overflow the fq queue and trigger qdisc drop error.
 *
 * Scenarios:
 * --------------------------------
 *  1. Reroute to a running tun device
 *  2. Reroute to a device where qdisc drop
 *
 *  For case 1, ping packets should be received by the tun device.
 *
 *  For case 2, force UDP packets to overflow fq limit. As long as kernel
 *  is not crashed, it is considered successful.
 */
#define NETNS "ns_lwt_reroute"
#include "lwt_helpers.h"
#include "network_helpers.h"
#include <linux/net_tstamp.h>

#define BPF_OBJECT            "test_lwt_reroute.bpf.o"
#define LOCAL_SRC             "10.0.0.1"
#define TEST_CIDR             "10.0.0.0/24"
#define XMIT_HOOK             "xmit"
#define XMIT_SECTION          "lwt_xmit"
#define NSEC_PER_SEC          1000000000ULL

/* send a ping to be rerouted to the target device */
static void ping_once(const char *ip)
{
	/* We won't get a reply. Don't fail here */
	SYS_NOFAIL("ping %s -c1 -W1 -s %d",
		   ip, ICMP_PAYLOAD_SIZE);
}

/* Send snd_target UDP packets to overflow the fq queue and trigger qdisc drop
 * error. This is done via TX tstamp to force buffering delayed packets.
 */
static int overflow_fq(int snd_target, const char *target_ip)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(1234),
	};

	char data_buf[8]; /* only #pkts matter, so use a random small buffer */
	char control_buf[CMSG_SPACE(sizeof(uint64_t))];
	struct iovec iov = {
		.iov_base = data_buf,
		.iov_len = sizeof(data_buf),
	};
	int err = -1;
	int s = -1;
	struct sock_txtime txtime_on = {
		.clockid = CLOCK_MONOTONIC,
		.flags = 0,
	};
	struct msghdr msg = {
		.msg_name = &addr,
		.msg_namelen = sizeof(addr),
		.msg_control = control_buf,
		.msg_controllen = sizeof(control_buf),
		.msg_iovlen = 1,
		.msg_iov = &iov,
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	memset(data_buf, 0, sizeof(data_buf));

	s = socket(AF_INET, SOCK_DGRAM, 0);
	if (!ASSERT_GE(s, 0, "socket"))
		goto out;

	err = setsockopt(s, SOL_SOCKET, SO_TXTIME, &txtime_on, sizeof(txtime_on));
	if (!ASSERT_OK(err, "setsockopt(SO_TXTIME)"))
		goto out;

	err = inet_pton(AF_INET, target_ip, &addr.sin_addr);
	if (!ASSERT_EQ(err, 1, "inet_pton"))
		goto out;

	while (snd_target > 0) {
		struct timespec now;

		memset(control_buf, 0, sizeof(control_buf));
		cmsg->cmsg_type = SCM_TXTIME;
		cmsg->cmsg_level = SOL_SOCKET;
		cmsg->cmsg_len = CMSG_LEN(sizeof(uint64_t));

		err = clock_gettime(CLOCK_MONOTONIC, &now);
		if (!ASSERT_OK(err, "clock_gettime(CLOCK_MONOTONIC)")) {
			err = -1;
			goto out;
		}

		*(uint64_t *)CMSG_DATA(cmsg) = (now.tv_nsec + 1) * NSEC_PER_SEC +
					       now.tv_nsec;

		/* we will intentionally send more than fq limit, so ignore
		 * the error here.
		 */
		sendmsg(s, &msg, MSG_NOSIGNAL);
		snd_target--;
	}

	/* no kernel crash so far is considered success */
	err = 0;

out:
	if (s >= 0)
		close(s);

	return err;
}

static int setup(const char *tun_dev)
{
	int target_index = -1;
	int tap_fd = -1;

	tap_fd = open_tuntap(tun_dev, false);
	if (!ASSERT_GE(tap_fd, 0, "open_tun"))
		return -1;

	target_index = if_nametoindex(tun_dev);
	if (!ASSERT_GE(target_index, 0, "if_nametoindex"))
		return -1;

	SYS(fail, "ip link add link_err type dummy");
	SYS(fail, "ip link set lo up");
	SYS(fail, "ip addr add dev lo " LOCAL_SRC "/32");
	SYS(fail, "ip link set link_err up");
	SYS(fail, "ip link set %s up", tun_dev);

	SYS(fail, "ip route add %s dev link_err encap bpf xmit obj %s sec lwt_xmit",
	    TEST_CIDR, BPF_OBJECT);

	SYS(fail, "ip rule add pref 100 from all fwmark %d lookup 100",
	    target_index);
	SYS(fail, "ip route add t 100 default dev %s", tun_dev);

	return tap_fd;

fail:
	if (tap_fd >= 0)
		close(tap_fd);
	return -1;
}

static void test_lwt_reroute_normal_xmit(void)
{
	const char *tun_dev = "tun0";
	int tun_fd = -1;
	int ifindex = -1;
	char ip[256];
	struct timeval timeo = {
		.tv_sec = 0,
		.tv_usec = 250000,
	};

	tun_fd = setup(tun_dev);
	if (!ASSERT_GE(tun_fd, 0, "setup_reroute"))
		return;

	ifindex = if_nametoindex(tun_dev);
	if (!ASSERT_GE(ifindex, 0, "if_nametoindex"))
		return;

	snprintf(ip, 256, "10.0.0.%d", ifindex);

	/* ping packets should be received by the tun device */
	ping_once(ip);

	if (!ASSERT_EQ(wait_for_packet(tun_fd, __expect_icmp_ipv4, &timeo), 1,
		       "wait_for_packet"))
		log_err("%s xmit", __func__);
}

/*
 * Test the failure case when the skb is dropped at the qdisc. This is a
 * regression prevention at the xmit hook only.
 */
static void test_lwt_reroute_qdisc_dropped(void)
{
	const char *tun_dev = "tun0";
	int tun_fd = -1;
	int ifindex = -1;
	char ip[256];

	tun_fd = setup(tun_dev);
	if (!ASSERT_GE(tun_fd, 0, "setup_reroute"))
		goto fail;

	SYS(fail, "tc qdisc replace dev %s root fq limit 5 flow_limit 5", tun_dev);

	ifindex = if_nametoindex(tun_dev);
	if (!ASSERT_GE(ifindex, 0, "if_nametoindex"))
		return;

	snprintf(ip, 256, "10.0.0.%d", ifindex);
	ASSERT_EQ(overflow_fq(10, ip), 0, "overflow_fq");

fail:
	if (tun_fd >= 0)
		close(tun_fd);
}

static void *test_lwt_reroute_run(void *arg)
{
	netns_delete();
	RUN_TEST(lwt_reroute_normal_xmit);
	RUN_TEST(lwt_reroute_qdisc_dropped);
	return NULL;
}

void test_lwt_reroute(void)
{
	pthread_t test_thread;
	int err;

	/* Run the tests in their own thread to isolate the namespace changes
	 * so they do not affect the environment of other tests.
	 * (specifically needed because of unshare(CLONE_NEWNS) in open_netns())
	 */
	err = pthread_create(&test_thread, NULL, &test_lwt_reroute_run, NULL);
	if (ASSERT_OK(err, "pthread_create"))
		ASSERT_OK(pthread_join(test_thread, NULL), "pthread_join");
}