author     Alexei Starovoitov <ast@kernel.org>  2018-01-26 03:41:15 +0300
committer  Alexei Starovoitov <ast@kernel.org>  2018-01-26 03:41:16 +0300
commit     82f1e0f3acf04dfd2de95f7ffb824528ecbf0a95 (patch)
tree       a740942fab1902605fe48ed877d942c6ad02ad58 /include
parent     e9dcd80b9d77a92bfae6ce42a451f5c5fd318832 (diff)
parent     d6d4f60c3a0933852dcc40a2142d93027ea1da76 (diff)
Merge branch 'bpf-more-sock_ops-callbacks'
Lawrence Brakmo says:

====================
This patchset adds support for:

- direct R or R/W access to many tcp_sock fields
- passing up to 4 arguments to sock_ops BPF functions
- tcp_sock field bpf_sock_ops_cb_flags for controlling callbacks
- optionally calling sock_ops BPF program when RTO fires
- optionally calling sock_ops BPF program when packet is retransmitted
- optionally calling sock_ops BPF program when TCP state changes
- access to tclass and sk_txhash
- new selftest

v2: Fixed commit message 0/11. The commit is to "bpf-next" but the patch
    below used "bpf" and Patchwork didn't work correctly.
v3: Cleaned RTO callback as per Yuchung's comment.
    Added BPF enum for TCP states as per Alexei's comment.
v4: Fixed compile warnings related to detecting changes between TCP
    internal states and the BPF defined states.
v5: Fixed comment issues in some selftest files.
    Fixed access issue with u64 fields in bpf_sock_ops struct.
v6: Made fixes based on comments from Eric Dumazet:
    The field bpf_sock_ops_cb_flags was added in a hole on 64-bit kernels.
    Field bpf_sock_ops_cb_flags is now set through a helper function, which
    returns an error when a BPF program tries to set bits for callbacks
    that are not supported in the current kernel.
    Added a comment indicating that when adding fields to bpf_sock_ops_kern
    they should be added before the field named "temp" if they need to be
    cleared before calling the BPF function.
v7: Enforced that the fields "op" and "replylong[1] .. replylong[3]" are
    not writable, based on comments from Eric Dumazet and Alexei
    Starovoitov.
    Filled a 32-bit hole in the bpf_sock_ops struct with sk_txhash, based
    on comments from Daniel Borkmann.
    Removed unused functions (tcp_call_bpf_1arg, tcp_call_bpf_4arg), based
    on comments from Daniel Borkmann.
v8: Add commit message 00/12.
    Add Acked-by as appropriate.
v9: Moved the bug fix to the front of the patchset.
    Changed RETRANS_CB so it is always called (before, it was only called
    if the retransmit succeeded). It is now called with an extra argument,
    the return value of tcp_transmit_skb (0 => success). Based on comments
    from Yuchung Cheng.
    Added support for reading 2 new fields, sacked_out and lost_out, based
    on comments from Yuchung Cheng.
v10: Moved the callback flags from include/uapi/linux/tcp.h to
     include/uapi/linux/bpf.h.
     Cleaned up the test in selftest. Added a timeout so it always
     completes, even if the client is not communicating with the server.
     Made it faster by removing the sleeps. Made sure it works even when
     called back-to-back 20 times.

Consists of the following patches:
[PATCH bpf-next v10 01/12] bpf: Only reply field should be writeable
[PATCH bpf-next v10 02/12] bpf: Make SOCK_OPS_GET_TCP size
[PATCH bpf-next v10 03/12] bpf: Make SOCK_OPS_GET_TCP struct
[PATCH bpf-next v10 04/12] bpf: Add write access to tcp_sock and sock
[PATCH bpf-next v10 05/12] bpf: Support passing args to sock_ops bpf
[PATCH bpf-next v10 06/12] bpf: Adds field bpf_sock_ops_cb_flags to
[PATCH bpf-next v10 07/12] bpf: Add sock_ops RTO callback
[PATCH bpf-next v10 08/12] bpf: Add support for reading sk_state and
[PATCH bpf-next v10 09/12] bpf: Add sock_ops R/W access to tclass
[PATCH bpf-next v10 10/12] bpf: Add BPF_SOCK_OPS_RETRANS_CB
[PATCH bpf-next v10 11/12] bpf: Add BPF_SOCK_OPS_STATE_CB
[PATCH bpf-next v10 12/12] bpf: add selftest for tcpbpf
====================

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
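To make the new interface concrete, here is a minimal sock_ops program sketch (not code from this series; the <bpf/bpf_helpers.h> include, section name, and function name are assumptions) that enables the new callbacks once a connection is established and records whether the kernel rejected any of the requested flag bits:

// SPDX-License-Identifier: GPL-2.0
/* Sketch only: enable the new sock_ops callbacks for connections in the
 * attached cgroup. Helper and op names come from the uapi changes below;
 * everything else is illustrative.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sockops")
int enable_cb_flags(struct bpf_sock_ops *skops)
{
	int rv = 0;

	switch (skops->op) {
	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
		/* Per the helper description in the uapi hunk below, a
		 * non-zero return reports flag bits this kernel does not
		 * support (or -EINVAL if the socket is not a full TCP
		 * socket).
		 */
		rv = bpf_sock_ops_cb_flags_set(skops,
					       BPF_SOCK_OPS_RTO_CB_FLAG |
					       BPF_SOCK_OPS_RETRANS_CB_FLAG |
					       BPF_SOCK_OPS_STATE_CB_FLAG);
		break;
	}
	skops->reply = rv;	/* only the reply field is writable */
	return 1;
}

char _license[] SEC("license") = "GPL";

Such a program is attached to a cgroup as a BPF_CGROUP_SOCK_OPS program; the selftest added in patch 12/12 exercises this flow with its own loader.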
Diffstat (limited to 'include')
-rw-r--r--  include/linux/filter.h    10
-rw-r--r--  include/linux/tcp.h       11
-rw-r--r--  include/net/tcp.h         42
-rw-r--r--  include/uapi/linux/bpf.h  84
4 files changed, 138 insertions, 9 deletions
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 425056c7f96c..20384c4bed25 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1003,10 +1003,20 @@ struct bpf_sock_ops_kern {
struct sock *sk;
u32 op;
union {
+ u32 args[4];
u32 reply;
u32 replylong[4];
};
u32 is_fullsock;
+ u64 temp; /* temp and everything after is not
+ * initialized to 0 before calling
+ * the BPF program. New fields that
+ * should be initialized to 0 should
+ * be inserted before temp.
+ * temp is scratch storage used by
+ * sock_ops_convert_ctx_access
+ * as temporary storage of a register.
+ */
};
#endif /* __LINUX_FILTER_H__ */
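The comment on "temp" above defines a layout convention: tcp_call_bpf() (see the net/tcp.h hunk below) clears the structure only up to temp, so any new field that must start out as zero has to be placed in front of it. A simplified sketch of the rule, with a hypothetical extra field:

/* Sketch, not from the patch: simplified version of the layout rule. */
struct sock_ops_kern_sketch {
	struct sock *sk;
	u32 op;
	u32 args[4];
	u32 is_fullsock;
	u32 new_zeroed_field;	/* hypothetical: before temp, so it gets cleared */
	u64 temp;		/* temp and everything after stay uninitialized */
};

static inline void sock_ops_kern_sketch_clear(struct sock_ops_kern_sketch *ops)
{
	/* Zero only the prefix, exactly as tcp_call_bpf() does with offsetof(). */
	memset(ops, 0, offsetof(struct sock_ops_kern_sketch, temp));
}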
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4f93f0953c41..8f4c54986f97 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -335,6 +335,17 @@ struct tcp_sock {
int linger2;
+
+/* Sock_ops bpf program related variables */
+#ifdef CONFIG_BPF
+ u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs
+ * values defined in uapi/linux/tcp.h
+ */
+#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG)
+#else
+#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0
+#endif
+
/* Receiver side RTT estimation */
struct {
u32 rtt_us;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5a1d26a18599..093e967a2960 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2006,12 +2006,12 @@ void tcp_cleanup_ulp(struct sock *sk);
* program loaded).
*/
#ifdef CONFIG_BPF
-static inline int tcp_call_bpf(struct sock *sk, int op)
+static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
{
struct bpf_sock_ops_kern sock_ops;
int ret;
- memset(&sock_ops, 0, sizeof(sock_ops));
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
if (sk_fullsock(sk)) {
sock_ops.is_fullsock = 1;
sock_owned_by_me(sk);
@@ -2019,6 +2019,8 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
sock_ops.sk = sk;
sock_ops.op = op;
+ if (nargs > 0)
+ memcpy(sock_ops.args, args, nargs * sizeof(*args));
ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
if (ret == 0)
@@ -2027,18 +2029,46 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
ret = -1;
return ret;
}
+
+static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
+{
+ u32 args[2] = {arg1, arg2};
+
+ return tcp_call_bpf(sk, op, 2, args);
+}
+
+static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+ u32 arg3)
+{
+ u32 args[3] = {arg1, arg2, arg3};
+
+ return tcp_call_bpf(sk, op, 3, args);
+}
+
#else
-static inline int tcp_call_bpf(struct sock *sk, int op)
+static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
{
return -EPERM;
}
+
+static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
+{
+ return -EPERM;
+}
+
+static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+ u32 arg3)
+{
+ return -EPERM;
+}
+
#endif
static inline u32 tcp_timeout_init(struct sock *sk)
{
int timeout;
- timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT);
+ timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL);
if (timeout <= 0)
timeout = TCP_TIMEOUT_INIT;
@@ -2049,7 +2079,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
{
int rwnd;
- rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT);
+ rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL);
if (rwnd < 0)
rwnd = 0;
@@ -2058,7 +2088,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
{
- return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
+ return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
}
#if IS_ENABLED(CONFIG_SMC)
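For context, this is roughly how the pieces above are meant to fit together in TCP code: the per-socket flag set by the program gates the callback, and one of the new *_arg helpers passes the arguments. The function below is illustrative only; the real call sites are added by the net/ipv4 patches of this series.

/* Illustrative fragment, not the exact code added by the series. */
static void sketch_report_retransmit(struct sock *sk, struct sk_buff *skb,
				     int segs, int err)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Costs a single flag test when the program did not opt in. */
	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
		tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
				  TCP_SKB_CB(skb)->seq, segs, err);
}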
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 406c19d6016b..db6bdc375126 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -642,6 +642,14 @@ union bpf_attr {
* @optlen: length of optval in bytes
* Return: 0 or negative error
*
+ * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags)
+ * Set callback flags for sock_ops
+ * @bpf_sock_ops: pointer to bpf_sock_ops_kern struct
+ * @flags: flags value
+ * Return: 0 for no error
+ * -EINVAL if there is no full tcp socket
+ * bits in flags that are not supported by current kernel
+ *
* int bpf_skb_adjust_room(skb, len_diff, mode, flags)
* Grow or shrink room in sk_buff.
* @skb: pointer to skb
@@ -748,7 +756,8 @@ union bpf_attr {
FN(perf_event_read_value), \
FN(perf_prog_read_value), \
FN(getsockopt), \
- FN(override_return),
+ FN(override_return), \
+ FN(sock_ops_cb_flags_set),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@@ -952,8 +961,9 @@ struct bpf_map_info {
struct bpf_sock_ops {
__u32 op;
union {
- __u32 reply;
- __u32 replylong[4];
+ __u32 args[4]; /* Optionally passed to bpf program */
+ __u32 reply; /* Returned by bpf program */
+ __u32 replylong[4]; /* Optionally returned by bpf prog */
};
__u32 family;
__u32 remote_ip4; /* Stored in network byte order */
@@ -968,8 +978,39 @@ struct bpf_sock_ops {
*/
__u32 snd_cwnd;
__u32 srtt_us; /* Averaged RTT << 3 in usecs */
+ __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */
+ __u32 state;
+ __u32 rtt_min;
+ __u32 snd_ssthresh;
+ __u32 rcv_nxt;
+ __u32 snd_nxt;
+ __u32 snd_una;
+ __u32 mss_cache;
+ __u32 ecn_flags;
+ __u32 rate_delivered;
+ __u32 rate_interval_us;
+ __u32 packets_out;
+ __u32 retrans_out;
+ __u32 total_retrans;
+ __u32 segs_in;
+ __u32 data_segs_in;
+ __u32 segs_out;
+ __u32 data_segs_out;
+ __u32 lost_out;
+ __u32 sacked_out;
+ __u32 sk_txhash;
+ __u64 bytes_received;
+ __u64 bytes_acked;
};
+/* Definitions for bpf_sock_ops_cb_flags */
+#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0)
+#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1)
+#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2)
+#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x7 /* Mask of all currently
+ * supported cb flags
+ */
+
/* List of known BPF sock_ops operators.
* New entries can only be added at the end
*/
@@ -1003,6 +1044,43 @@ enum {
* a congestion threshold. RTTs above
* this indicate congestion
*/
+ BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered.
+ * Arg1: value of icsk_retransmits
+ * Arg2: value of icsk_rto
+ * Arg3: whether RTO has expired
+ */
+ BPF_SOCK_OPS_RETRANS_CB, /* Called when skb is retransmitted.
+ * Arg1: sequence number of 1st byte
+ * Arg2: # segments
+ * Arg3: return value of
+ * tcp_transmit_skb (0 => success)
+ */
+ BPF_SOCK_OPS_STATE_CB, /* Called when TCP changes state.
+ * Arg1: old_state
+ * Arg2: new_state
+ */
+};
+
+/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
+ * changes between the TCP and BPF versions. Ideally this should never happen.
+ * If it does, we need to add code to convert them before calling
+ * the BPF sock_ops function.
+ */
+enum {
+ BPF_TCP_ESTABLISHED = 1,
+ BPF_TCP_SYN_SENT,
+ BPF_TCP_SYN_RECV,
+ BPF_TCP_FIN_WAIT1,
+ BPF_TCP_FIN_WAIT2,
+ BPF_TCP_TIME_WAIT,
+ BPF_TCP_CLOSE,
+ BPF_TCP_CLOSE_WAIT,
+ BPF_TCP_LAST_ACK,
+ BPF_TCP_LISTEN,
+ BPF_TCP_CLOSING, /* Now a valid state */
+ BPF_TCP_NEW_SYN_RECV,
+
+ BPF_TCP_MAX_STATES /* Leave at the end! */
};
#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */
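Putting the new operators, their arguments, and the BPF_TCP_* states together, a program that asked for the callbacks could consume them along these lines (a sketch, not the selftest; SEC() and bpf_printk() are assumed to come from libbpf's <bpf/bpf_helpers.h>):

// SPDX-License-Identifier: GPL-2.0
/* Sketch only: assumes the callback flags were enabled as in the earlier
 * example. Argument meanings are taken from the enum comments above.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sockops")
int watch_tcp_events(struct bpf_sock_ops *skops)
{
	switch (skops->op) {
	case BPF_SOCK_OPS_RTO_CB:
		/* Arg1: icsk_retransmits, Arg2: icsk_rto, Arg3: RTO expired */
		bpf_printk("RTO: retransmits=%u rto=%u",
			   skops->args[0], skops->args[1]);
		break;
	case BPF_SOCK_OPS_RETRANS_CB:
		/* Arg1: seq of first byte, Arg2: # segments,
		 * Arg3: return value of tcp_transmit_skb (0 => success)
		 */
		if (skops->args[2])
			bpf_printk("retransmit of seq %u failed",
				   skops->args[0]);
		break;
	case BPF_SOCK_OPS_STATE_CB:
		/* Arg1: old state, Arg2: new state (BPF_TCP_* values) */
		if (skops->args[1] == BPF_TCP_CLOSE)
			bpf_printk("closed: bytes_acked=%llu",
				   skops->bytes_acked);
		break;
	}
	skops->reply = 0;
	return 1;
}

char _license[] SEC("license") = "GPL";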