summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2017-10-18 14:12:19 +0300
committerDavid S. Miller <davem@davemloft.net>2017-10-18 14:12:19 +0300
commit452606d6c9cd7cd6d1037d06763c687f617e795f (patch)
tree426663736699bf8fb062846ca34e601413865732 /include
parent4b70c62b9eafcee0505b440732d2e00c50f3085d (diff)
parentfad3917e361b115f776563366415ffb2fc706bf1 (diff)
downloadlinux-452606d6c9cd7cd6d1037d06763c687f617e795f.tar.xz
Merge branch 'bpf-cpumap-type-for-XDP_REDIRECT'
Jesper Dangaard Brouer says: ==================== net: New bpf cpumap type for XDP_REDIRECT Introducing a new way to redirect XDP frames. Notice how no driver changes are necessary given the design of XDP_REDIRECT. This redirect map type is called 'cpumap', as it allows redirection XDP frames to remote CPUs. The remote CPU will do the SKB allocation and start the network stack invocation on that CPU. This is a scalability and isolation mechanism, that allow separating the early driver network XDP layer, from the rest of the netstack, and assigning dedicated CPUs for this stage. The sysadm control/configure the RX-CPU to NIC-RX queue (as usual) via procfs smp_affinity and how many queues are configured via ethtool --set-channels. Benchmarks show that a single CPU can handle approx 11Mpps. Thus, only assigning two NIC RX-queues (and two CPUs) is sufficient for handling 10Gbit/s wirespeed smallest packet 14.88Mpps. Reducing the number of queues have the advantage that more packets being "bulk" available per hard interrupt[1]. [1] https://www.netdevconf.org/2.1/papers/BusyPollingNextGen.pdf Use-cases: 1. End-host based pre-filtering for DDoS mitigation. This is fast enough to allow software to see and filter all packets wirespeed. Thus, no packets getting silently dropped by hardware. 2. Given NIC HW unevenly distributes packets across RX queue, this mechanism can be used for redistribution load across CPUs. This usually happens when HW is unaware of a new protocol. This resembles RPS (Receive Packet Steering), just faster, but with more responsibility placed on the BPF program for correct steering. 3. Auto-scaling or power saving via only activating the appropriate number of remote CPUs for handling the current load. The cpumap tracepoints can function as a feedback loop for this purpose. In V7, a --stress-mode was implemented for the samples program, which between each stats update, adds + removes CPUs from the map concurrently with traffic. I did find and fix some concurrency issues in the tear-down path, details in patch desc. The stress test have now been running for 15 hours without any issues, while being bombarded with 11.6 Mpps via pktgen_sample04_many_flows.sh. See individual patches for patchset-version changes. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include')
-rw-r--r--include/linux/bpf.h31
-rw-r--r--include/linux/bpf_types.h1
-rw-r--r--include/linux/netdevice.h1
-rw-r--r--include/trace/events/xdp.h80
-rw-r--r--include/uapi/linux/bpf.h1
5 files changed, 111 insertions, 3 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4373125de1f3..6d4dd844828a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -355,6 +355,13 @@ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
void __dev_map_flush(struct bpf_map *map);
+struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
+void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
+void __cpu_map_flush(struct bpf_map *map);
+struct xdp_buff;
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
+ struct net_device *dev_rx);
+
/* Return map's numa specified by userspace */
static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
{
@@ -362,7 +369,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
attr->numa_node : NUMA_NO_NODE;
}
-#else
+#else /* !CONFIG_BPF_SYSCALL */
static inline struct bpf_prog *bpf_prog_get(u32 ufd)
{
return ERR_PTR(-EOPNOTSUPP);
@@ -425,6 +432,28 @@ static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index)
static inline void __dev_map_flush(struct bpf_map *map)
{
}
+
+static inline
+struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ return NULL;
+}
+
+static inline void __cpu_map_insert_ctx(struct bpf_map *map, u32 index)
+{
+}
+
+static inline void __cpu_map_flush(struct bpf_map *map)
+{
+}
+
+struct xdp_buff;
+static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
+ struct xdp_buff *xdp,
+ struct net_device *dev_rx)
+{
+ return 0;
+}
#endif /* CONFIG_BPF_SYSCALL */
#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL)
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 6f1a567667b8..814c1081a4a9 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -41,4 +41,5 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
#ifdef CONFIG_STREAM_PARSER
BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
#endif
+BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
#endif
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 31bb3010c69b..bf014afcb914 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3260,6 +3260,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
int netif_rx(struct sk_buff *skb);
int netif_rx_ni(struct sk_buff *skb);
int netif_receive_skb(struct sk_buff *skb);
+int netif_receive_skb_core(struct sk_buff *skb);
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
void napi_gro_flush(struct napi_struct *napi, bool flush_old);
struct sk_buff *napi_get_frags(struct napi_struct *napi);
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 4e16c43fba10..0c8dec61987e 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -136,14 +136,90 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err,
__entry->map_id, __entry->map_index)
);
+#define devmap_ifindex(fwd, map) \
+ (!fwd ? 0 : \
+ (!map ? 0 : \
+ ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \
+ ((struct net_device *)fwd)->ifindex : 0)))
+
#define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \
- trace_xdp_redirect_map(dev, xdp, fwd ? fwd->ifindex : 0, \
+ trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \
0, map, idx)
#define _trace_xdp_redirect_map_err(dev, xdp, fwd, map, idx, err) \
- trace_xdp_redirect_map_err(dev, xdp, fwd ? fwd->ifindex : 0, \
+ trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map), \
err, map, idx)
+TRACE_EVENT(xdp_cpumap_kthread,
+
+ TP_PROTO(int map_id, unsigned int processed, unsigned int drops,
+ int sched),
+
+ TP_ARGS(map_id, processed, drops, sched),
+
+ TP_STRUCT__entry(
+ __field(int, map_id)
+ __field(u32, act)
+ __field(int, cpu)
+ __field(unsigned int, drops)
+ __field(unsigned int, processed)
+ __field(int, sched)
+ ),
+
+ TP_fast_assign(
+ __entry->map_id = map_id;
+ __entry->act = XDP_REDIRECT;
+ __entry->cpu = smp_processor_id();
+ __entry->drops = drops;
+ __entry->processed = processed;
+ __entry->sched = sched;
+ ),
+
+ TP_printk("kthread"
+ " cpu=%d map_id=%d action=%s"
+ " processed=%u drops=%u"
+ " sched=%d",
+ __entry->cpu, __entry->map_id,
+ __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
+ __entry->processed, __entry->drops,
+ __entry->sched)
+);
+
+TRACE_EVENT(xdp_cpumap_enqueue,
+
+ TP_PROTO(int map_id, unsigned int processed, unsigned int drops,
+ int to_cpu),
+
+ TP_ARGS(map_id, processed, drops, to_cpu),
+
+ TP_STRUCT__entry(
+ __field(int, map_id)
+ __field(u32, act)
+ __field(int, cpu)
+ __field(unsigned int, drops)
+ __field(unsigned int, processed)
+ __field(int, to_cpu)
+ ),
+
+ TP_fast_assign(
+ __entry->map_id = map_id;
+ __entry->act = XDP_REDIRECT;
+ __entry->cpu = smp_processor_id();
+ __entry->drops = drops;
+ __entry->processed = processed;
+ __entry->to_cpu = to_cpu;
+ ),
+
+ TP_printk("enqueue"
+ " cpu=%d map_id=%d action=%s"
+ " processed=%u drops=%u"
+ " to_cpu=%d",
+ __entry->cpu, __entry->map_id,
+ __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
+ __entry->processed, __entry->drops,
+ __entry->to_cpu)
+);
+
#endif /* _TRACE_XDP_H */
#include <trace/define_trace.h>
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6db9e1d679cd..4303fb6c3817 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -112,6 +112,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_HASH_OF_MAPS,
BPF_MAP_TYPE_DEVMAP,
BPF_MAP_TYPE_SOCKMAP,
+ BPF_MAP_TYPE_CPUMAP,
};
enum bpf_prog_type {