diff options
67 files changed, 741 insertions, 252 deletions
diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 069c42f22a8c..5bb144435c16 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -110,7 +110,7 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_trace_printk: - if (capable(CAP_SYS_ADMIN)) + if (perfmon_capable()) return bpf_get_trace_printk_proto(); /* fall through */ default: diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 2818965427e9..85b87ed02dd5 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -1606,6 +1606,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, "%s qid %d\n", __func__, rx_ring->qid); res_budget = budget; xdp.rxq = &rx_ring->xdp_rxq; + xdp.frame_sz = ENA_PAGE_SIZE; do { xdp_verdict = XDP_PASS; diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h b/drivers/net/ethernet/amazon/ena/ena_netdev.h index 7df67bf09b93..680099afcccf 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.h +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h @@ -151,8 +151,9 @@ * The buffer size we share with the device is defined to be ENA_PAGE_SIZE */ -#define ENA_XDP_MAX_MTU (ENA_PAGE_SIZE - ETH_HLEN - ETH_FCS_LEN - \ - VLAN_HLEN - XDP_PACKET_HEADROOM) +#define ENA_XDP_MAX_MTU (ENA_PAGE_SIZE - ETH_HLEN - ETH_FCS_LEN - \ + VLAN_HLEN - XDP_PACKET_HEADROOM - \ + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) #define ENA_IS_XDP_INDEX(adapter, index) (((index) >= (adapter)->xdp_first_ring) && \ ((index) < (adapter)->xdp_first_ring + (adapter)->xdp_num_queues)) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index c6f6f2033880..5e3b4a3b69ea 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -138,6 +138,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, xdp_set_data_meta_invalid(&xdp); xdp.data_end = *data_ptr + *len; xdp.rxq = &rxr->xdp_rxq; + xdp.frame_sz = PAGE_SIZE; /* BNXT_RX_PAGE_MODE(bp) when XDP enabled */ orig_data = xdp.data; rcu_read_lock(); diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index b4b33368698f..2ba0ce115e63 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -552,6 +552,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog, xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; xdp.rxq = &rq->xdp_rxq; + xdp.frame_sz = RCV_FRAG_LEN + XDP_PACKET_HEADROOM; orig_data = xdp.data; rcu_read_lock(); diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 0f3e842a4fd6..8c8d95aa1dfd 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -331,6 +331,9 @@ static u32 run_xdp(struct dpaa2_eth_priv *priv, xdp_set_data_meta_invalid(&xdp); xdp.rxq = &ch->xdp_rxq; + xdp.frame_sz = DPAA2_ETH_RX_BUF_RAW_SIZE - + (dpaa2_fd_get_offset(fd) - XDP_PACKET_HEADROOM); + xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp); /* xdp.data pointer may have changed */ @@ -366,7 +369,11 @@ static u32 run_xdp(struct dpaa2_eth_priv *priv, dma_unmap_page(priv->net_dev->dev.parent, addr, DPAA2_ETH_RX_BUF_SIZE, DMA_BIDIRECTIONAL); ch->buf_count--; + + /* Allow redirect use of full headroom */ xdp.data_hard_start = vaddr; + xdp.frame_sz = DPAA2_ETH_RX_BUF_RAW_SIZE; + err = xdp_do_redirect(priv->net_dev, &xdp, xdp_prog); if (unlikely(err)) ch->stats.xdp_drop++; diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index b8496037ef7f..a3772beffe02 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1507,6 +1507,22 @@ static inline unsigned int i40e_rx_offset(struct i40e_ring *rx_ring) return ring_uses_build_skb(rx_ring) ? I40E_SKB_PAD : 0; } +static unsigned int i40e_rx_frame_truesize(struct i40e_ring *rx_ring, + unsigned int size) +{ + unsigned int truesize; + +#if (PAGE_SIZE < 8192) + truesize = i40e_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */ +#else + truesize = i40e_rx_offset(rx_ring) ? + SKB_DATA_ALIGN(size + i40e_rx_offset(rx_ring)) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : + SKB_DATA_ALIGN(size); +#endif + return truesize; +} + /** * i40e_alloc_mapped_page - recycle or make a new page * @rx_ring: ring to use @@ -2246,13 +2262,11 @@ static void i40e_rx_buffer_flip(struct i40e_ring *rx_ring, struct i40e_rx_buffer *rx_buffer, unsigned int size) { -#if (PAGE_SIZE < 8192) - unsigned int truesize = i40e_rx_pg_size(rx_ring) / 2; + unsigned int truesize = i40e_rx_frame_truesize(rx_ring, size); +#if (PAGE_SIZE < 8192) rx_buffer->page_offset ^= truesize; #else - unsigned int truesize = SKB_DATA_ALIGN(i40e_rx_offset(rx_ring) + size); - rx_buffer->page_offset += truesize; #endif } @@ -2335,6 +2349,9 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) bool failure = false; struct xdp_buff xdp; +#if (PAGE_SIZE < 8192) + xdp.frame_sz = i40e_rx_frame_truesize(rx_ring, 0); +#endif xdp.rxq = &rx_ring->xdp_rxq; while (likely(total_rx_packets < (unsigned int)budget)) { @@ -2389,7 +2406,10 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) xdp.data_hard_start = xdp.data - i40e_rx_offset(rx_ring); xdp.data_end = xdp.data + size; - +#if (PAGE_SIZE > 4096) + /* At larger PAGE_SIZE, frame_sz depend on len size */ + xdp.frame_sz = i40e_rx_frame_truesize(rx_ring, size); +#endif skb = i40e_run_xdp(rx_ring, &xdp); } diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 0b7d29192b2c..2b9184aead5f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -531,12 +531,14 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) { unsigned int total_rx_bytes = 0, total_rx_packets = 0; u16 cleaned_count = I40E_DESC_UNUSED(rx_ring); + struct xdp_umem *umem = rx_ring->xsk_umem; unsigned int xdp_res, xdp_xmit = 0; bool failure = false; struct sk_buff *skb; struct xdp_buff xdp; xdp.rxq = &rx_ring->xdp_rxq; + xdp.frame_sz = xsk_umem_xdp_frame_sz(umem); while (likely(total_rx_packets < (unsigned int)budget)) { struct i40e_rx_buffer *bi; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index f67e8362958c..69b21b436f9a 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -423,6 +423,22 @@ static unsigned int ice_rx_offset(struct ice_ring *rx_ring) return 0; } +static unsigned int ice_rx_frame_truesize(struct ice_ring *rx_ring, + unsigned int size) +{ + unsigned int truesize; + +#if (PAGE_SIZE < 8192) + truesize = ice_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */ +#else + truesize = ice_rx_offset(rx_ring) ? + SKB_DATA_ALIGN(ice_rx_offset(rx_ring) + size) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : + SKB_DATA_ALIGN(size); +#endif + return truesize; +} + /** * ice_run_xdp - Executes an XDP program on initialized xdp_buff * @rx_ring: Rx ring @@ -991,6 +1007,10 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) bool failure; xdp.rxq = &rx_ring->xdp_rxq; + /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ +#if (PAGE_SIZE < 8192) + xdp.frame_sz = ice_rx_frame_truesize(rx_ring, 0); +#endif /* start the loop to process Rx packets bounded by 'budget' */ while (likely(total_rx_pkts < (unsigned int)budget)) { @@ -1038,6 +1058,10 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) xdp.data_hard_start = xdp.data - ice_rx_offset(rx_ring); xdp.data_meta = xdp.data; xdp.data_end = xdp.data + size; +#if (PAGE_SIZE > 4096) + /* At larger PAGE_SIZE, frame_sz depend on len size */ + xdp.frame_sz = ice_rx_frame_truesize(rx_ring, size); +#endif rcu_read_lock(); xdp_prog = READ_ONCE(rx_ring->xdp_prog); @@ -1051,16 +1075,8 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) if (!xdp_res) goto construct_skb; if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) { - unsigned int truesize; - -#if (PAGE_SIZE < 8192) - truesize = ice_rx_pg_size(rx_ring) / 2; -#else - truesize = SKB_DATA_ALIGN(ice_rx_offset(rx_ring) + - size); -#endif xdp_xmit |= xdp_res; - ice_rx_buf_adjust_pg_offset(rx_buf, truesize); + ice_rx_buf_adjust_pg_offset(rx_buf, xdp.frame_sz); } else { rx_buf->pagecnt_bias++; } diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 8279db15e870..23e5515d4527 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -840,11 +840,13 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget) { unsigned int total_rx_bytes = 0, total_rx_packets = 0; u16 cleaned_count = ICE_DESC_UNUSED(rx_ring); + struct xdp_umem *umem = rx_ring->xsk_umem; unsigned int xdp_xmit = 0; bool failure = false; struct xdp_buff xdp; xdp.rxq = &rx_ring->xdp_rxq; + xdp.frame_sz = xsk_umem_xdp_frame_sz(umem); while (likely(total_rx_packets < (unsigned int)budget)) { union ice_32b_rx_flex_desc *rx_desc; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 718931d951bc..eab5934b04f5 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2244,19 +2244,30 @@ xdp_out: return ERR_PTR(-result); } +static unsigned int ixgbe_rx_frame_truesize(struct ixgbe_ring *rx_ring, + unsigned int size) +{ + unsigned int truesize; + +#if (PAGE_SIZE < 8192) + truesize = ixgbe_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */ +#else + truesize = ring_uses_build_skb(rx_ring) ? + SKB_DATA_ALIGN(IXGBE_SKB_PAD + size) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : + SKB_DATA_ALIGN(size); +#endif + return truesize; +} + static void ixgbe_rx_buffer_flip(struct ixgbe_ring *rx_ring, struct ixgbe_rx_buffer *rx_buffer, unsigned int size) { + unsigned int truesize = ixgbe_rx_frame_truesize(rx_ring, size); #if (PAGE_SIZE < 8192) - unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2; - rx_buffer->page_offset ^= truesize; #else - unsigned int truesize = ring_uses_build_skb(rx_ring) ? - SKB_DATA_ALIGN(IXGBE_SKB_PAD + size) : - SKB_DATA_ALIGN(size); - rx_buffer->page_offset += truesize; #endif } @@ -2290,6 +2301,11 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, xdp.rxq = &rx_ring->xdp_rxq; + /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ +#if (PAGE_SIZE < 8192) + xdp.frame_sz = ixgbe_rx_frame_truesize(rx_ring, 0); +#endif + while (likely(total_rx_packets < budget)) { union ixgbe_adv_rx_desc *rx_desc; struct ixgbe_rx_buffer *rx_buffer; @@ -2323,7 +2339,10 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, xdp.data_hard_start = xdp.data - ixgbe_rx_offset(rx_ring); xdp.data_end = xdp.data + size; - +#if (PAGE_SIZE > 4096) + /* At larger PAGE_SIZE, frame_sz depend on len size */ + xdp.frame_sz = ixgbe_rx_frame_truesize(rx_ring, size); +#endif skb = ixgbe_run_xdp(adapter, rx_ring, &xdp); } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index 74b540ebb3dc..a656ee9a1fae 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -431,12 +431,14 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector, unsigned int total_rx_bytes = 0, total_rx_packets = 0; struct ixgbe_adapter *adapter = q_vector->adapter; u16 cleaned_count = ixgbe_desc_unused(rx_ring); + struct xdp_umem *umem = rx_ring->xsk_umem; unsigned int xdp_res, xdp_xmit = 0; bool failure = false; struct sk_buff *skb; struct xdp_buff xdp; xdp.rxq = &rx_ring->xdp_rxq; + xdp.frame_sz = xsk_umem_xdp_frame_sz(umem); while (likely(total_rx_packets < budget)) { union ixgbe_adv_rx_desc *rx_desc; diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 4622c4ea2e46..a39e2cb384dd 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -1095,19 +1095,31 @@ xdp_out: return ERR_PTR(-result); } +static unsigned int ixgbevf_rx_frame_truesize(struct ixgbevf_ring *rx_ring, + unsigned int size) +{ + unsigned int truesize; + +#if (PAGE_SIZE < 8192) + truesize = ixgbevf_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */ +#else + truesize = ring_uses_build_skb(rx_ring) ? + SKB_DATA_ALIGN(IXGBEVF_SKB_PAD + size) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : + SKB_DATA_ALIGN(size); +#endif + return truesize; +} + static void ixgbevf_rx_buffer_flip(struct ixgbevf_ring *rx_ring, struct ixgbevf_rx_buffer *rx_buffer, unsigned int size) { -#if (PAGE_SIZE < 8192) - unsigned int truesize = ixgbevf_rx_pg_size(rx_ring) / 2; + unsigned int truesize = ixgbevf_rx_frame_truesize(rx_ring, size); +#if (PAGE_SIZE < 8192) rx_buffer->page_offset ^= truesize; #else - unsigned int truesize = ring_uses_build_skb(rx_ring) ? - SKB_DATA_ALIGN(IXGBEVF_SKB_PAD + size) : - SKB_DATA_ALIGN(size); - rx_buffer->page_offset += truesize; #endif } @@ -1125,6 +1137,11 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, xdp.rxq = &rx_ring->xdp_rxq; + /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ +#if (PAGE_SIZE < 8192) + xdp.frame_sz = ixgbevf_rx_frame_truesize(rx_ring, 0); +#endif + while (likely(total_rx_packets < budget)) { struct ixgbevf_rx_buffer *rx_buffer; union ixgbe_adv_rx_desc *rx_desc; @@ -1157,7 +1174,10 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, xdp.data_hard_start = xdp.data - ixgbevf_rx_offset(rx_ring); xdp.data_end = xdp.data + size; - +#if (PAGE_SIZE > 4096) + /* At larger PAGE_SIZE, frame_sz depend on len size */ + xdp.frame_sz = ixgbevf_rx_frame_truesize(rx_ring, size); +#endif skb = ixgbevf_run_xdp(adapter, rx_ring, &xdp); } diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index e0e9e56830c0..41d2a0eac5fa 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2148,12 +2148,17 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq, struct bpf_prog *prog, struct xdp_buff *xdp, struct mvneta_stats *stats) { - unsigned int len; + unsigned int len, sync; + struct page *page; u32 ret, act; len = xdp->data_end - xdp->data_hard_start - pp->rx_offset_correction; act = bpf_prog_run_xdp(prog, xdp); + /* Due xdp_adjust_tail: DMA sync for_device cover max len CPU touch */ + sync = xdp->data_end - xdp->data_hard_start - pp->rx_offset_correction; + sync = max(sync, len); + switch (act) { case XDP_PASS: stats->xdp_pass++; @@ -2164,9 +2169,8 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq, err = xdp_do_redirect(pp->dev, xdp, prog); if (unlikely(err)) { ret = MVNETA_XDP_DROPPED; - page_pool_put_page(rxq->page_pool, - virt_to_head_page(xdp->data), len, - true); + page = virt_to_head_page(xdp->data); + page_pool_put_page(rxq->page_pool, page, sync, true); } else { ret = MVNETA_XDP_REDIR; stats->xdp_redirect++; @@ -2175,10 +2179,10 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq, } case XDP_TX: ret = mvneta_xdp_xmit_back(pp, xdp); - if (ret != MVNETA_XDP_TX) - page_pool_put_page(rxq->page_pool, - virt_to_head_page(xdp->data), len, - true); + if (ret != MVNETA_XDP_TX) { + page = virt_to_head_page(xdp->data); + page_pool_put_page(rxq->page_pool, page, sync, true); + } break; default: bpf_warn_invalid_xdp_action(act); @@ -2187,8 +2191,8 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq, trace_xdp_exception(pp->dev, prog, act); /* fall through */ case XDP_DROP: - page_pool_put_page(rxq->page_pool, - virt_to_head_page(xdp->data), len, true); + page = virt_to_head_page(xdp->data); + page_pool_put_page(rxq->page_pool, page, sync, true); ret = MVNETA_XDP_DROPPED; stats->xdp_drop++; break; @@ -2320,6 +2324,7 @@ static int mvneta_rx_swbm(struct napi_struct *napi, rcu_read_lock(); xdp_prog = READ_ONCE(pp->xdp_prog); xdp_buf.rxq = &rxq->xdp_rxq; + xdp_buf.frame_sz = PAGE_SIZE; /* Fairness NAPI loop */ while (rx_proc < budget && rx_proc < rx_todo) { diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 43dcbd8214c6..5bd3cd37d50f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -51,7 +51,8 @@ #include "en_port.h" #define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) - \ - XDP_PACKET_HEADROOM)) + XDP_PACKET_HEADROOM - \ + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))) int mlx4_en_setup_tc(struct net_device *dev, u8 up) { diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 787139219813..8a10285b0e10 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -683,6 +683,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud rcu_read_lock(); xdp_prog = rcu_dereference(ring->xdp_prog); xdp.rxq = &ring->xdp_rxq; + xdp.frame_sz = priv->frag_info[0].frag_stride; doorbell_pending = 0; /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 3bd64c63865b..26911b15f8fe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -625,6 +625,7 @@ struct mlx5e_rq { struct { u16 umem_headroom; u16 headroom; + u32 frame0_sz; u8 map_dir; /* dma map direction */ } buff; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index c4a7fb4ecd14..761c8979bd41 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -137,6 +137,7 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, if (xsk) xdp.handle = di->xsk.handle; xdp.rxq = &rq->xdp_rxq; + xdp.frame_sz = rq->buff.frame0_sz; act = bpf_prog_run_xdp(prog, &xdp); if (xsk) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 0a9dfc31de3e..0e4ca08ddca9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -462,6 +462,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, rq->mpwqe.num_strides = BIT(mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk)); + rq->buff.frame0_sz = (1 << rq->mpwqe.log_stride_sz); + err = mlx5e_create_rq_umr_mkey(mdev, rq); if (err) goto err_rq_wq_destroy; @@ -485,6 +487,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, num_xsk_frames = wq_sz << rq->wqe.info.log_num_frags; rq->wqe.info = rqp->frags_info; + rq->buff.frame0_sz = rq->wqe.info.arr[0].frag_stride; + rq->wqe.frags = kvzalloc_node(array_size(sizeof(*rq->wqe.frags), (wq_sz << rq->wqe.info.log_num_frags)), @@ -522,6 +526,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, } if (xsk) { + rq->buff.frame0_sz = xsk_umem_xdp_frame_sz(umem); + err = mlx5e_xsk_resize_reuseq(umem, num_xsk_frames); if (unlikely(err)) { mlx5_core_err(mdev, "Unable to allocate the Reuse Ring for %u frames\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 779600bebcca..821f94beda7a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1070,6 +1070,7 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, if (consumed) return NULL; /* page/packet was consumed by XDP */ + frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt); skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt); if (unlikely(!skb)) return NULL; @@ -1371,6 +1372,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, return NULL; /* page/packet was consumed by XDP */ } + frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt32); skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt32); if (unlikely(!skb)) return NULL; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 9bfb3b077bc1..0e0cc3d58bdc 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1741,10 +1741,15 @@ nfp_net_tx_xdp_buf(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring, struct nfp_net_rx_buf *rxbuf, unsigned int dma_off, unsigned int pkt_len, bool *completed) { + unsigned int dma_map_sz = dp->fl_bufsz - NFP_NET_RX_BUF_NON_DATA; struct nfp_net_tx_buf *txbuf; struct nfp_net_tx_desc *txd; int wr_idx; + /* Reject if xdp_adjust_tail grow packet beyond DMA area */ + if (pkt_len + dma_off > dma_map_sz) + return false; + if (unlikely(nfp_net_tx_full(tx_ring, 1))) { if (!*completed) { nfp_net_xdp_complete(tx_ring); @@ -1817,6 +1822,7 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) rcu_read_lock(); xdp_prog = READ_ONCE(dp->xdp_prog); true_bufsz = xdp_prog ? PAGE_SIZE : dp->fl_bufsz; + xdp.frame_sz = PAGE_SIZE - NFP_NET_RX_BUF_HEADROOM; xdp.rxq = &rx_ring->xdp_rxq; tx_ring = r_vec->xdp_ring; diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index c6c20776b474..7598ebe0962a 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -1066,6 +1066,7 @@ static bool qede_rx_xdp(struct qede_dev *edev, xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + *len; xdp.rxq = &rxq->xdp_rxq; + xdp.frame_sz = rxq->rx_buf_seg_size; /* PAGE_SIZE when XDP enabled */ /* Queues always have a full reset currently, so for the time * being until there's atomic program replace just mark read diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index f50d9a9b76be..b2d154258b07 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -1476,7 +1476,7 @@ static int qede_alloc_mem_rxq(struct qede_dev *edev, struct qede_rx_queue *rxq) if (rxq->rx_buf_size + size > PAGE_SIZE) rxq->rx_buf_size = PAGE_SIZE - size; - /* Segment size to spilt a page in multiple equal parts , + /* Segment size to split a page in multiple equal parts, * unless XDP is used in which case we'd use the entire page. */ if (!edev->xdp_prog) { diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index 260352d97d9d..68c47a8c71df 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -308,6 +308,7 @@ static bool efx_do_xdp(struct efx_nic *efx, struct efx_channel *channel, xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + rx_buf->len; xdp.rxq = &rx_queue->xdp_rxq_info; + xdp.frame_sz = efx->rx_page_buf_step; xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp); rcu_read_unlock(); diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index a5a0fb60193a..e1f4be4b3d69 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -884,23 +884,28 @@ static u32 netsec_run_xdp(struct netsec_priv *priv, struct bpf_prog *prog, struct xdp_buff *xdp) { struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX]; - unsigned int len = xdp->data_end - xdp->data; + unsigned int sync, len = xdp->data_end - xdp->data; u32 ret = NETSEC_XDP_PASS; + struct page *page; int err; u32 act; act = bpf_prog_run_xdp(prog, xdp); + /* Due xdp_adjust_tail: DMA sync for_device cover max len CPU touch */ + sync = xdp->data_end - xdp->data_hard_start - NETSEC_RXBUF_HEADROOM; + sync = max(sync, len); + switch (act) { case XDP_PASS: ret = NETSEC_XDP_PASS; break; case XDP_TX: ret = netsec_xdp_xmit_back(priv, xdp); - if (ret != NETSEC_XDP_TX) - page_pool_put_page(dring->page_pool, - virt_to_head_page(xdp->data), len, - true); + if (ret != NETSEC_XDP_TX) { + page = virt_to_head_page(xdp->data); + page_pool_put_page(dring->page_pool, page, sync, true); + } break; case XDP_REDIRECT: err = xdp_do_redirect(priv->ndev, xdp, prog); @@ -908,9 +913,8 @@ static u32 netsec_run_xdp(struct netsec_priv *priv, struct bpf_prog *prog, ret = NETSEC_XDP_REDIR; } else { ret = NETSEC_XDP_CONSUMED; - page_pool_put_page(dring->page_pool, - virt_to_head_page(xdp->data), len, - true); + page = virt_to_head_page(xdp->data); + page_pool_put_page(dring->page_pool, page, sync, true); } break; default: @@ -921,8 +925,8 @@ static u32 netsec_run_xdp(struct netsec_priv *priv, struct bpf_prog *prog, /* fall through -- handle aborts by dropping packet */ case XDP_DROP: ret = NETSEC_XDP_CONSUMED; - page_pool_put_page(dring->page_pool, - virt_to_head_page(xdp->data), len, true); + page = virt_to_head_page(xdp->data); + page_pool_put_page(dring->page_pool, page, sync, true); break; } @@ -936,10 +940,14 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) struct netsec_rx_pkt_info rx_info; enum dma_data_direction dma_dir; struct bpf_prog *xdp_prog; + struct xdp_buff xdp; u16 xdp_xmit = 0; u32 xdp_act = 0; int done = 0; + xdp.rxq = &dring->xdp_rxq; + xdp.frame_sz = PAGE_SIZE; + rcu_read_lock(); xdp_prog = READ_ONCE(priv->xdp_prog); dma_dir = page_pool_get_dma_dir(dring->page_pool); @@ -953,7 +961,6 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) struct sk_buff *skb = NULL; u16 pkt_len, desc_len; dma_addr_t dma_handle; - struct xdp_buff xdp; void *buf_addr; if (de->attr & (1U << NETSEC_RX_PKT_OWN_FIELD)) { @@ -1002,7 +1009,6 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) xdp.data = desc->addr + NETSEC_RXBUF_HEADROOM; xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + pkt_len; - xdp.rxq = &dring->xdp_rxq; if (xdp_prog) { xdp_result = netsec_run_xdp(priv, xdp_prog, &xdp); diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 09f98fa2fb4e..ce0645ada6e7 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -406,6 +406,7 @@ static void cpsw_rx_handler(void *token, int len, int status) xdp.data_hard_start = pa; xdp.rxq = &priv->xdp_rxq[ch]; + xdp.frame_sz = PAGE_SIZE; port = priv->emac_port + cpsw->data.dual_emac; ret = cpsw_run_xdp(priv, ch, &xdp, page, port); diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index dce49311d3d3..1247d35d42ef 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -348,6 +348,7 @@ static void cpsw_rx_handler(void *token, int len, int status) xdp.data_hard_start = pa; xdp.rxq = &priv->xdp_rxq[ch]; + xdp.frame_sz = PAGE_SIZE; ret = cpsw_run_xdp(priv, ch, &xdp, page, priv->emac_port); if (ret != CPSW_XDP_PASS) diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c index b86611041db6..1e0c024b0a93 100644 --- a/drivers/net/hyperv/netvsc_bpf.c +++ b/drivers/net/hyperv/netvsc_bpf.c @@ -49,6 +49,7 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan, xdp_set_data_meta_invalid(xdp); xdp->data_end = xdp->data + len; xdp->rxq = &nvchan->xdp_rxq; + xdp->frame_sz = PAGE_SIZE; xdp->handle = 0; memcpy(xdp->data, data, len); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 5de57fc3ec60..6267f706e8ee 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -795,7 +795,7 @@ static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net, if (xbuf) { unsigned int hdroom = xdp->data - xdp->data_hard_start; unsigned int xlen = xdp->data_end - xdp->data; - unsigned int frag_size = netvsc_xdp_fraglen(hdroom + xlen); + unsigned int frag_size = xdp->frame_sz; skb = build_skb(xbuf, frag_size); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 44889eba1dbc..c54f967e2c66 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1671,6 +1671,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; xdp.rxq = &tfile->xdp_rxq; + xdp.frame_sz = buflen; act = bpf_prog_run_xdp(xdp_prog, &xdp); if (act == XDP_REDIRECT || act == XDP_TX) { @@ -2411,6 +2412,7 @@ static int tun_xdp_one(struct tun_struct *tun, } xdp_set_data_meta_invalid(xdp); xdp->rxq = &tfile->xdp_rxq; + xdp->frame_sz = buflen; act = bpf_prog_run_xdp(xdp_prog, xdp); err = tun_xdp_act(tun, xdp_prog, xdp, act); diff --git a/drivers/net/veth.c b/drivers/net/veth.c index aece0e5eec8c..b586d2fa5551 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -405,10 +405,6 @@ static struct sk_buff *veth_build_skb(void *head, int headroom, int len, { struct sk_buff *skb; - if (!buflen) { - buflen = SKB_DATA_ALIGN(headroom + len) + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - } skb = build_skb(head, buflen); if (!skb) return NULL; @@ -564,13 +560,15 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, struct veth_stats *stats) { void *hard_start = frame->data - frame->headroom; - void *head = hard_start - sizeof(struct xdp_frame); int len = frame->len, delta = 0; struct xdp_frame orig_frame; struct bpf_prog *xdp_prog; unsigned int headroom; struct sk_buff *skb; + /* bpf_xdp_adjust_head() assures BPF cannot access xdp_frame area */ + hard_start -= sizeof(struct xdp_frame); + rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (likely(xdp_prog)) { @@ -581,6 +579,7 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, xdp.data = frame->data; xdp.data_end = frame->data + frame->len; xdp.data_meta = frame->data - frame->metasize; + xdp.frame_sz = frame->frame_sz; xdp.rxq = &rq->xdp_rxq; act = bpf_prog_run_xdp(xdp_prog, &xdp); @@ -592,7 +591,6 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, break; case XDP_TX: orig_frame = *frame; - xdp.data_hard_start = head; xdp.rxq->mem = frame->mem; if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { trace_xdp_exception(rq->dev, xdp_prog, act); @@ -605,7 +603,6 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, goto xdp_xmit; case XDP_REDIRECT: orig_frame = *frame; - xdp.data_hard_start = head; xdp.rxq->mem = frame->mem; if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { frame = &orig_frame; @@ -629,7 +626,7 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, rcu_read_unlock(); headroom = sizeof(struct xdp_frame) + frame->headroom - delta; - skb = veth_build_skb(head, headroom, len, 0); + skb = veth_build_skb(hard_start, headroom, len, frame->frame_sz); if (!skb) { xdp_return_frame(frame); stats->rx_drops++; @@ -695,9 +692,8 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, goto drop; } - nskb = veth_build_skb(head, - VETH_XDP_HEADROOM + mac_len, skb->len, - PAGE_SIZE); + nskb = veth_build_skb(head, VETH_XDP_HEADROOM + mac_len, + skb->len, PAGE_SIZE); if (!nskb) { page_frag_free(head); goto drop; @@ -715,6 +711,11 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, xdp.data_end = xdp.data + pktlen; xdp.data_meta = xdp.data; xdp.rxq = &rq->xdp_rxq; + + /* SKB "head" area always have tailroom for skb_shared_info */ + xdp.frame_sz = (void *)skb_end_pointer(skb) - xdp.data_hard_start; + xdp.frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + orig_data = xdp.data; orig_data_end = xdp.data_end; @@ -758,6 +759,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, } rcu_read_unlock(); + /* check if bpf_xdp_adjust_head was used */ delta = orig_data - xdp.data; off = mac_len + delta; if (off > 0) @@ -765,9 +767,11 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, else if (off < 0) __skb_pull(skb, -off); skb->mac_header -= delta; + + /* check if bpf_xdp_adjust_tail was used */ off = xdp.data_end - orig_data_end; if (off != 0) - __skb_put(skb, off); + __skb_put(skb, off); /* positive on grow, negative on shrink */ skb->protocol = eth_type_trans(skb, rq->dev); metalen = xdp.data - xdp.data_meta; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 11f722460513..9e1b5d748586 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -689,6 +689,7 @@ static struct sk_buff *receive_small(struct net_device *dev, xdp.data_end = xdp.data + len; xdp.data_meta = xdp.data; xdp.rxq = &rq->xdp_rxq; + xdp.frame_sz = buflen; orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); stats->xdp_packets++; @@ -797,10 +798,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, int offset = buf - page_address(page); struct sk_buff *head_skb, *curr_skb; struct bpf_prog *xdp_prog; - unsigned int truesize; + unsigned int truesize = mergeable_ctx_to_truesize(ctx); unsigned int headroom = mergeable_ctx_to_headroom(ctx); - int err; unsigned int metasize = 0; + unsigned int frame_sz; + int err; head_skb = NULL; stats->bytes += len - vi->hdr_len; @@ -821,6 +823,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, if (unlikely(hdr->hdr.gso_type)) goto err_xdp; + /* Buffers with headroom use PAGE_SIZE as alloc size, + * see add_recvbuf_mergeable() + get_mergeable_buf_len() + */ + frame_sz = headroom ? PAGE_SIZE : truesize; + /* This happens when rx buffer size is underestimated * or headroom is not enough because of the buffer * was refilled before XDP is set. This should only @@ -834,6 +841,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, page, offset, VIRTIO_XDP_HEADROOM, &len); + frame_sz = PAGE_SIZE; + if (!xdp_page) goto err_xdp; offset = VIRTIO_XDP_HEADROOM; @@ -850,6 +859,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, xdp.data_end = xdp.data + (len - vi->hdr_len); xdp.data_meta = xdp.data; xdp.rxq = &rq->xdp_rxq; + xdp.frame_sz = frame_sz - vi->hdr_len; act = bpf_prog_run_xdp(xdp_prog, &xdp); stats->xdp_packets++; @@ -924,7 +934,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, } rcu_read_unlock(); - truesize = mergeable_ctx_to_truesize(ctx); if (unlikely(len > truesize)) { pr_debug("%s: rx error: len %u exceeds truesize %lu\n", dev->name, len, (unsigned long)ctx); diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 2927f02cc7e1..516519dcc8ff 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -747,6 +747,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, xdp->data = buf + pad; xdp->data_end = xdp->data + len; hdr->buflen = buflen; + xdp->frame_sz = buflen; --net->refcnt_bias; alloc_frag->offset += buflen; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c45d198ac38c..efe8836b5c48 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -19,6 +19,7 @@ #include <linux/mutex.h> #include <linux/module.h> #include <linux/kallsyms.h> +#include <linux/capability.h> struct bpf_verifier_env; struct bpf_verifier_log; @@ -119,7 +120,7 @@ struct bpf_map { struct bpf_map_memory memory; char name[BPF_OBJ_NAME_LEN]; u32 btf_vmlinux_value_type_id; - bool unpriv_array; + bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ /* 22 bytes hole */ @@ -1095,6 +1096,21 @@ struct bpf_map *bpf_map_get_curr_or_next(u32 *id); extern int sysctl_unprivileged_bpf_disabled; +static inline bool bpf_allow_ptr_leaks(void) +{ + return perfmon_capable(); +} + +static inline bool bpf_bypass_spec_v1(void) +{ + return perfmon_capable(); +} + +static inline bool bpf_bypass_spec_v4(void) +{ + return perfmon_capable(); +} + int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6abd5a778fcd..ea833087e853 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -375,6 +375,9 @@ struct bpf_verifier_env { u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; + bool bpf_capable; + bool bypass_spec_v1; + bool bypass_spec_v4; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ const struct bpf_line_info *prev_linfo; diff --git a/include/linux/capability.h b/include/linux/capability.h index 027d7e4a853b..b4345b38a6be 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -256,6 +256,11 @@ static inline bool perfmon_capable(void) return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN); } +static inline bool bpf_capable(void) +{ + return capable(CAP_BPF) || capable(CAP_SYS_ADMIN); +} + /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); diff --git a/include/net/xdp.h b/include/net/xdp.h index 3cc6d5d84aa4..3094fccf5a88 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -6,6 +6,8 @@ #ifndef __LINUX_NET_XDP_H__ #define __LINUX_NET_XDP_H__ +#include <linux/skbuff.h> /* skb_shared_info */ + /** * DOC: XDP RX-queue information * @@ -70,13 +72,25 @@ struct xdp_buff { void *data_hard_start; unsigned long handle; struct xdp_rxq_info *rxq; + u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ }; +/* Reserve memory area at end-of data area. + * + * This macro reserves tailroom in the XDP buffer by limiting the + * XDP/BPF data access to data_hard_end. Notice same area (and size) + * is used for XDP_PASS, when constructing the SKB via build_skb(). + */ +#define xdp_data_hard_end(xdp) \ + ((xdp)->data_hard_start + (xdp)->frame_sz - \ + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) + struct xdp_frame { void *data; u16 len; u16 headroom; - u16 metasize; + u32 metasize:8; + u32 frame_sz:24; /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, * while mem info is valid on remote CPU. */ @@ -91,6 +105,10 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame) frame->dev_rx = NULL; } +/* Avoids inlining WARN macro in fast-path */ +void xdp_warn(const char *msg, const char *func, const int line); +#define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) + struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); /* Convert xdp_buff to xdp_frame */ @@ -111,6 +129,12 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) if (unlikely((headroom - metasize) < sizeof(*xdp_frame))) return NULL; + /* Catch if driver didn't reserve tailroom for skb_shared_info */ + if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) { + XDP_WARN("Driver BUG: missing reserved tailroom"); + return NULL; + } + /* Store info in top of packet */ xdp_frame = xdp->data_hard_start; @@ -118,6 +142,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) xdp_frame->len = xdp->data_end - xdp->data; xdp_frame->headroom = headroom - sizeof(*xdp_frame); xdp_frame->metasize = metasize; + xdp_frame->frame_sz = xdp->frame_sz; /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ xdp_frame->mem = xdp->rxq->mem; diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 67191ccaab85..abd72de25fa4 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -236,6 +236,12 @@ static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address, else return address + offset; } + +static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) +{ + return umem->chunk_size_nohr + umem->headroom; +} + #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { @@ -366,6 +372,11 @@ static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle, return 0; } +static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) +{ + return 0; +} + static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { return -EOPNOTSUPP; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 32cbf36c7729..b9b8a0f63b91 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2015,8 +2015,8 @@ union bpf_attr { * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is - * only possible to shrink the packet as of this writing, - * therefore *delta* must be a negative integer. + * possible to both shrink and grow the packet tail. + * Shrink done via *delta* being a negative integer. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index e58c9636741b..c7372180a0a9 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -274,6 +274,7 @@ struct vfs_ns_cap_data { arbitrary SCSI commands */ /* Allow setting encryption key on loopback filesystem */ /* Allow setting zone reclaim policy */ +/* Allow everything under CAP_BPF and CAP_PERFMON for backward compatibility */ #define CAP_SYS_ADMIN 21 @@ -374,7 +375,38 @@ struct vfs_ns_cap_data { #define CAP_PERFMON 38 -#define CAP_LAST_CAP CAP_PERFMON +/* + * CAP_BPF allows the following BPF operations: + * - Creating all types of BPF maps + * - Advanced verifier features + * - Indirect variable access + * - Bounded loops + * - BPF to BPF function calls + * - Scalar precision tracking + * - Larger complexity limits + * - Dead code elimination + * - And potentially other features + * - Loading BPF Type Format (BTF) data + * - Retrieve xlated and JITed code of BPF programs + * - Use bpf_spin_lock() helper + * + * CAP_PERFMON relaxes the verifier checks further: + * - BPF progs can use of pointer-to-integer conversions + * - speculation attack hardening measures are bypassed + * - bpf_probe_read to read arbitrary kernel memory is allowed + * - bpf_trace_printk to print kernel memory is allowed + * + * CAP_SYS_ADMIN is required to use bpf_probe_write_user. + * + * CAP_SYS_ADMIN is required to iterate system wide loaded + * programs, maps, links, BTFs and convert their IDs to file descriptors. + * + * CAP_PERFMON and CAP_BPF are required to load tracing programs. + * CAP_NET_ADMIN and CAP_BPF are required to load networking programs. + */ +#define CAP_BPF 39 + +#define CAP_LAST_CAP CAP_BPF #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 95d77770353c..1d5bb0d983b2 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -77,7 +77,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int ret, numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; - bool unpriv = !capable(CAP_SYS_ADMIN); + bool bypass_spec_v1 = bpf_bypass_spec_v1(); u64 cost, array_size, mask64; struct bpf_map_memory mem; struct bpf_array *array; @@ -95,7 +95,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) mask64 -= 1; index_mask = mask64; - if (unpriv) { + if (!bypass_spec_v1) { /* round up array size to nearest power of 2, * since cpu will speculate within index_mask limits */ @@ -149,7 +149,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); } array->index_mask = index_mask; - array->map.unpriv_array = unpriv; + array->map.bypass_spec_v1 = bypass_spec_v1; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); @@ -219,7 +219,7 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - if (map->unpriv_array) { + if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { @@ -1053,7 +1053,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - if (map->unpriv_array) { + if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 26cb51f2db72..c6b0decaa46a 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -557,7 +557,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) struct bpf_map *map; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6aa11de67315..c40ff4cf9880 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -646,7 +646,7 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) return; bpf_prog_ksym_set_addr(fp); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 3fe0b006d2d2..8b85bfddfac7 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -85,7 +85,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) u64 cost; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); /* check sanity of attributes */ @@ -162,25 +162,10 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, /* Part of headroom was reserved to xdpf */ hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; - /* build_skb need to place skb_shared_info after SKB end, and - * also want to know the memory "truesize". Thus, need to - * know the memory frame size backing xdp_buff. - * - * XDP was designed to have PAGE_SIZE frames, but this - * assumption is not longer true with ixgbe and i40e. It - * would be preferred to set frame_size to 2048 or 4096 - * depending on the driver. - * frame_size = 2048; - * frame_len = frame_size - sizeof(*xdp_frame); - * - * Instead, with info avail, skb_shared_info in placed after - * packet len. This, unfortunately fakes the truesize. - * Another disadvantage of this approach, the skb_shared_info - * is not at a fixed memory location, with mixed length - * packets, which is bad for cache-line hotness. + /* Memory size backing xdp_frame data already have reserved + * room for build_skb to place skb_shared_info in tailroom. */ - frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + frame_size = xdpf->frame_sz; pkt_data_start = xdpf->data - hard_start_headroom; skb = build_skb_around(skb, pkt_data_start, frame_size); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d541c8486c95..b4b288a3c3c9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -359,9 +359,9 @@ static int htab_map_alloc_check(union bpf_attr *attr) BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != offsetof(struct htab_elem, hash_node.pprev)); - if (lru && !capable(CAP_SYS_ADMIN)) + if (lru && !bpf_capable()) /* LRU implementation is much complicated than other - * maps. Hence, limit to CAP_SYS_ADMIN for now. + * maps. Hence, limit to CAP_BPF. */ return -EPERM; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5c0290e0696e..886949fdcece 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -633,7 +633,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) break; } - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return NULL; switch (func_id) { @@ -642,6 +642,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; case BPF_FUNC_trace_printk: + if (!perfmon_capable()) + return NULL; return bpf_get_trace_printk_proto(); case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 65c236cf341e..c8cc4e4cf98d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -543,7 +543,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) u64 cost = sizeof(*trie), cost_per_node; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); /* check sanity of attributes */ diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index b3c48d1533cb..17738c93bec8 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -60,7 +60,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; if (inner_map->ops == &array_map_ops) { - inner_map_meta->unpriv_array = inner_map->unpriv_array; + inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1; container_of(inner_map_meta, struct bpf_array, map)->index_mask = container_of(inner_map, struct bpf_array, map)->index_mask; } diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 30e1373fd437..05c8e043b9d2 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -45,7 +45,7 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; /* check sanity of attributes */ diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 01badd3eda7a..21cde24386db 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -154,7 +154,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) struct bpf_map_memory mem; u64 array_size; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); array_size = sizeof(*array); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index db76339fe358..7b8381ce40a0 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -93,7 +93,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) u64 cost, n_buckets; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index de2a75500233..79bcd8d056d2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1534,7 +1534,7 @@ static int map_freeze(const union bpf_attr *attr) err = -EBUSY; goto err_put; } - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { err = -EPERM; goto err_put; } @@ -2009,6 +2009,55 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, } } +static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_LIRC_MODE2: + case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_EXT: /* extends any prog */ + return true; + case BPF_PROG_TYPE_CGROUP_SKB: + /* always unpriv */ + case BPF_PROG_TYPE_SK_REUSEPORT: + /* equivalent to SOCKET_FILTER. need CAP_BPF only */ + default: + return false; + } +} + +static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ + case BPF_PROG_TYPE_EXT: /* extends any prog */ + return true; + default: + return false; + } +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd @@ -2031,7 +2080,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) return -EPERM; /* copy eBPF program license from user space */ @@ -2044,11 +2093,16 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) is_gpl = license_is_gpl_compatible(license); if (attr->insn_cnt == 0 || - attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) + attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) return -E2BIG; if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) + return -EPERM; + + if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN)) + return -EPERM; + if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; bpf_prog_load_fixup_attach_type(attr); @@ -2682,6 +2736,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCKOPT: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: + if (!capable(CAP_NET_ADMIN)) + /* cg-skb progs can be loaded by unpriv user. + * check permissions at attach time. + */ + return -EPERM; return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? -EINVAL : 0; @@ -2747,9 +2806,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) struct bpf_prog *prog; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; @@ -2804,9 +2860,6 @@ static int bpf_prog_detach(const union bpf_attr *attr) { enum bpf_prog_type ptype; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_DETACH)) return -EINVAL; @@ -2819,6 +2872,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_PROG_TYPE_FLOW_DISSECTOR: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; return skb_flow_dissector_bpf_prog_detach(attr); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: @@ -2882,8 +2937,6 @@ static int bpf_prog_test_run(const union bpf_attr *attr, struct bpf_prog *prog; int ret = -ENOTSUPP; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; @@ -3184,7 +3237,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.run_time_ns = stats.nsecs; info.run_cnt = stats.cnt; - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { info.jited_prog_len = 0; info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; @@ -3543,7 +3596,7 @@ static int bpf_btf_load(const union bpf_attr *attr) if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; return btf_new_fd(attr); @@ -3766,9 +3819,6 @@ static int link_create(union bpf_attr *attr) struct bpf_prog *prog; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; @@ -3817,9 +3867,6 @@ static int link_update(union bpf_attr *attr) u32 flags; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_LINK_UPDATE)) return -EINVAL; @@ -3988,7 +4035,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz union bpf_attr attr; int err; - if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) + if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) return -EPERM; err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a3f2af756fd6..180933f6fba9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1295,7 +1295,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, reg->type = SCALAR_VALUE; reg->var_off = tnum_unknown; reg->frameno = 0; - reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks; + reg->precise = env->subprog_cnt > 1 || !env->bpf_capable; __mark_reg_unbounded(reg); } @@ -1427,8 +1427,9 @@ static int check_subprogs(struct bpf_verifier_env *env) continue; if (insn[i].src_reg != BPF_PSEUDO_CALL) continue; - if (!env->allow_ptr_leaks) { - verbose(env, "function calls to other bpf functions are allowed for root only\n"); + if (!env->bpf_capable) { + verbose(env, + "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); return -EPERM; } ret = add_subprog(env, i + insn[i].imm + 1); @@ -1962,8 +1963,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, bool new_marks = false; int i, err; - if (!env->allow_ptr_leaks) - /* backtracking is root only for now */ + if (!env->bpf_capable) return 0; func = st->frame[st->curframe]; @@ -2211,7 +2211,7 @@ static int check_stack_write(struct bpf_verifier_env *env, reg = &cur->regs[value_regno]; if (reg && size == BPF_REG_SIZE && register_is_const(reg) && - !register_is_null(reg) && env->allow_ptr_leaks) { + !register_is_null(reg) && env->bpf_capable) { if (dst_reg != BPF_REG_FP) { /* The backtracking logic can only recognize explicit * stack slot address like [fp - 8]. Other spill of @@ -2237,7 +2237,7 @@ static int check_stack_write(struct bpf_verifier_env *env, return -EINVAL; } - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v4) { bool sanitize = false; if (state->stack[spi].slot_type[0] == STACK_SPILL && @@ -3432,7 +3432,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, * Spectre masking for stack ALU. * See also retrieve_ptr_limit(). */ - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v1) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); @@ -4435,10 +4435,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, if (!BPF_MAP_PTR(aux->map_ptr_state)) bpf_map_ptr_store(aux, meta->map_ptr, - meta->map_ptr->unpriv_array); + !meta->map_ptr->bypass_spec_v1); else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr) bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, - meta->map_ptr->unpriv_array); + !meta->map_ptr->bypass_spec_v1); return 0; } @@ -4807,7 +4807,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, const struct bpf_insn *insn) { - return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K; + return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K; } static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, @@ -5117,7 +5117,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* For unprivileged we require that resulting offset must be in bounds * in order to be able to sanitize access later on. */ - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v1) { if (dst_reg->type == PTR_TO_MAP_VALUE && check_map_access(env, dst, dst_reg->off, 1, false)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " @@ -7244,7 +7244,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, insn_stack[env->cfg.cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - if (loop_ok && env->allow_ptr_leaks) + if (loop_ok && env->bpf_capable) return 0; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); @@ -8353,7 +8353,7 @@ next: if (env->max_states_per_insn < states_cnt) env->max_states_per_insn = states_cnt; - if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) + if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) return push_jmp_history(env, cur); if (!add_new_state) @@ -10014,7 +10014,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; - if (env->allow_ptr_leaks && !expect_blinding && + if (env->bpf_capable && !expect_blinding && prog->jit_requested && !bpf_map_key_poisoned(aux) && !bpf_map_ptr_poisoned(aux) && @@ -10758,7 +10758,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->insn_aux_data[i].orig_idx = i; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; - is_priv = capable(CAP_SYS_ADMIN); + is_priv = bpf_capable(); if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { mutex_lock(&bpf_verifier_lock); @@ -10799,7 +10799,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; - env->allow_ptr_leaks = is_priv; + env->allow_ptr_leaks = bpf_allow_ptr_leaks(); + env->bypass_spec_v1 = bpf_bypass_spec_v1(); + env->bypass_spec_v4 = bpf_bypass_spec_v4(); + env->bpf_capable = bpf_capable(); if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d961428fb5b6..9a84d7fb4869 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -315,6 +315,9 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { static const struct bpf_func_proto *bpf_get_probe_write_proto(void) { + if (!capable(CAP_SYS_ADMIN)) + return NULL; + pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!", current->comm, task_pid_nr(current)); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 29dbdd4c29f6..30ba7d38941d 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -470,25 +470,34 @@ out: int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { + u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + u32 headroom = XDP_PACKET_HEADROOM; u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; struct netdev_rx_queue *rxqueue; struct xdp_buff xdp = {}; u32 retval, duration; + u32 max_data_sz; void *data; int ret; if (kattr->test.ctx_in || kattr->test.ctx_out) return -EINVAL; - data = bpf_test_init(kattr, size, XDP_PACKET_HEADROOM + NET_IP_ALIGN, 0); + /* XDP have extra tailroom as (most) drivers use full page */ + max_data_sz = 4096 - headroom - tailroom; + if (size > max_data_sz) + return -EINVAL; + + data = bpf_test_init(kattr, max_data_sz, headroom, tailroom); if (IS_ERR(data)) return PTR_ERR(data); xdp.data_hard_start = data; - xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN; + xdp.data = data + headroom; xdp.data_meta = xdp.data; xdp.data_end = xdp.data + size; + xdp.frame_sz = headroom + max_data_sz + tailroom; rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); xdp.rxq = &rxqueue->xdp_rxq; @@ -496,8 +505,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); if (ret) goto out; - if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN || - xdp.data_end != xdp.data + size) + if (xdp.data != data + headroom || xdp.data_end != xdp.data + size) size = xdp.data_end - xdp.data; ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); out: diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 756b63b6f7b3..d2c4d16dadba 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -625,7 +625,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) !attr->btf_key_type_id || !attr->btf_value_type_id) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; if (attr->value_size > MAX_VALUE_SIZE) @@ -978,7 +978,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) /* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as * the map_alloc_check() side also does. */ - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); nla_for_each_nested(nla, nla_stgs, rem) { diff --git a/net/core/dev.c b/net/core/dev.c index 4c91de39890a..f937a3ff668d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4617,6 +4617,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, xdp->data_meta = xdp->data; xdp->data_end = xdp->data + hlen; xdp->data_hard_start = skb->data - skb_headroom(skb); + + /* SKB "head" area always have tailroom for skb_shared_info */ + xdp->frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start; + xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; @@ -4640,14 +4645,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, skb_reset_network_header(skb); } - /* check if bpf_xdp_adjust_tail was used. it can only "shrink" - * pckt. - */ - off = orig_data_end - xdp->data_end; + /* check if bpf_xdp_adjust_tail was used */ + off = xdp->data_end - orig_data_end; if (off != 0) { skb_set_tail_pointer(skb, xdp->data_end - xdp->data); - skb->len -= off; - + skb->len += off; /* positive on grow, negative on shrink */ } /* check if XDP changed eth hdr such SKB needs update */ diff --git a/net/core/filter.c b/net/core/filter.c index 5815902bb617..f8a3c7e9d027 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3411,15 +3411,26 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) { + void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */ void *data_end = xdp->data_end + offset; - /* only shrinking is allowed for now. */ - if (unlikely(offset >= 0)) + /* Notice that xdp_data_hard_end have reserved some tailroom */ + if (unlikely(data_end > data_hard_end)) return -EINVAL; + /* ALL drivers MUST init xdp->frame_sz, chicken check below */ + if (unlikely(xdp->frame_sz > PAGE_SIZE)) { + WARN_ONCE(1, "Too BIG xdp->frame_sz = %d\n", xdp->frame_sz); + return -EINVAL; + } + if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; + /* Clear memory area on grow, can contain uninit kernel memory */ + if (offset > 0) + memset(xdp->data_end, 0, offset); + xdp->data_end = data_end; return 0; @@ -6676,7 +6687,7 @@ static bool cg_skb_is_valid_access(int off, int size, return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return false; break; } @@ -6688,7 +6699,7 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return false; break; default: diff --git a/net/core/xdp.c b/net/core/xdp.c index 4c7ea85486af..490b8f5fa8ee 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -11,6 +11,7 @@ #include <linux/slab.h> #include <linux/idr.h> #include <linux/rhashtable.h> +#include <linux/bug.h> #include <net/page_pool.h> #include <net/xdp.h> @@ -496,3 +497,10 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) return xdpf; } EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame); + +/* Used by XDP_WARN macro, to avoid inlining WARN() in fast-path */ +void xdp_warn(const char *msg, const char *func, const int line) +{ + WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg); +}; +EXPORT_SYMBOL_GPL(xdp_warn); diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index d233ab3f1533..98e1513b608a 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -27,9 +27,9 @@ "audit_control", "setfcap" #define COMMON_CAP2_PERMS "mac_override", "mac_admin", "syslog", \ - "wake_alarm", "block_suspend", "audit_read", "perfmon" + "wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf" -#if CAP_LAST_CAP > CAP_PERFMON +#if CAP_LAST_CAP > CAP_BPF #error New capability defined, please update COMMON_CAP2_PERMS. #endif diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index f54347f55ee0..1b73e63274b5 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -80,13 +80,12 @@ print_bool_feature(const char *feat_name, const char *plain_name, printf("%s is %savailable\n", plain_name, res ? "" : "NOT "); } -static void print_kernel_option(const char *name, const char *value) +static void print_kernel_option(const char *name, const char *value, + const char *define_prefix) { char *endptr; int res; - /* No support for C-style ouptut */ - if (json_output) { if (!value) { jsonw_null_field(json_wtr, name); @@ -98,6 +97,12 @@ static void print_kernel_option(const char *name, const char *value) jsonw_int_field(json_wtr, name, res); else jsonw_string_field(json_wtr, name, value); + } else if (define_prefix) { + if (value) + printf("#define %s%s %s\n", define_prefix, + name, value); + else + printf("/* %s%s is not set */\n", define_prefix, name); } else { if (value) printf("%s is set to %s\n", name, value); @@ -315,77 +320,84 @@ static bool read_next_kernel_config_option(gzFile file, char *buf, size_t n, return false; } -static void probe_kernel_image_config(void) +static void probe_kernel_image_config(const char *define_prefix) { - static const char * const options[] = { + static const struct { + const char * const name; + bool macro_dump; + } options[] = { /* Enable BPF */ - "CONFIG_BPF", + { "CONFIG_BPF", }, /* Enable bpf() syscall */ - "CONFIG_BPF_SYSCALL", + { "CONFIG_BPF_SYSCALL", }, /* Does selected architecture support eBPF JIT compiler */ - "CONFIG_HAVE_EBPF_JIT", + { "CONFIG_HAVE_EBPF_JIT", }, /* Compile eBPF JIT compiler */ - "CONFIG_BPF_JIT", + { "CONFIG_BPF_JIT", }, /* Avoid compiling eBPF interpreter (use JIT only) */ - "CONFIG_BPF_JIT_ALWAYS_ON", + { "CONFIG_BPF_JIT_ALWAYS_ON", }, /* cgroups */ - "CONFIG_CGROUPS", + { "CONFIG_CGROUPS", }, /* BPF programs attached to cgroups */ - "CONFIG_CGROUP_BPF", + { "CONFIG_CGROUP_BPF", }, /* bpf_get_cgroup_classid() helper */ - "CONFIG_CGROUP_NET_CLASSID", + { "CONFIG_CGROUP_NET_CLASSID", }, /* bpf_skb_{,ancestor_}cgroup_id() helpers */ - "CONFIG_SOCK_CGROUP_DATA", + { "CONFIG_SOCK_CGROUP_DATA", }, /* Tracing: attach BPF to kprobes, tracepoints, etc. */ - "CONFIG_BPF_EVENTS", + { "CONFIG_BPF_EVENTS", }, /* Kprobes */ - "CONFIG_KPROBE_EVENTS", + { "CONFIG_KPROBE_EVENTS", }, /* Uprobes */ - "CONFIG_UPROBE_EVENTS", + { "CONFIG_UPROBE_EVENTS", }, /* Tracepoints */ - "CONFIG_TRACING", + { "CONFIG_TRACING", }, /* Syscall tracepoints */ - "CONFIG_FTRACE_SYSCALLS", + { "CONFIG_FTRACE_SYSCALLS", }, /* bpf_override_return() helper support for selected arch */ - "CONFIG_FUNCTION_ERROR_INJECTION", + { "CONFIG_FUNCTION_ERROR_INJECTION", }, /* bpf_override_return() helper */ - "CONFIG_BPF_KPROBE_OVERRIDE", + { "CONFIG_BPF_KPROBE_OVERRIDE", }, /* Network */ - "CONFIG_NET", + { "CONFIG_NET", }, /* AF_XDP sockets */ - "CONFIG_XDP_SOCKETS", + { "CONFIG_XDP_SOCKETS", }, /* BPF_PROG_TYPE_LWT_* and related helpers */ - "CONFIG_LWTUNNEL_BPF", + { "CONFIG_LWTUNNEL_BPF", }, /* BPF_PROG_TYPE_SCHED_ACT, TC (traffic control) actions */ - "CONFIG_NET_ACT_BPF", + { "CONFIG_NET_ACT_BPF", }, /* BPF_PROG_TYPE_SCHED_CLS, TC filters */ - "CONFIG_NET_CLS_BPF", + { "CONFIG_NET_CLS_BPF", }, /* TC clsact qdisc */ - "CONFIG_NET_CLS_ACT", + { "CONFIG_NET_CLS_ACT", }, /* Ingress filtering with TC */ - "CONFIG_NET_SCH_INGRESS", + { "CONFIG_NET_SCH_INGRESS", }, /* bpf_skb_get_xfrm_state() helper */ - "CONFIG_XFRM", + { "CONFIG_XFRM", }, /* bpf_get_route_realm() helper */ - "CONFIG_IP_ROUTE_CLASSID", + { "CONFIG_IP_ROUTE_CLASSID", }, /* BPF_PROG_TYPE_LWT_SEG6_LOCAL and related helpers */ - "CONFIG_IPV6_SEG6_BPF", + { "CONFIG_IPV6_SEG6_BPF", }, /* BPF_PROG_TYPE_LIRC_MODE2 and related helpers */ - "CONFIG_BPF_LIRC_MODE2", + { "CONFIG_BPF_LIRC_MODE2", }, /* BPF stream parser and BPF socket maps */ - "CONFIG_BPF_STREAM_PARSER", + { "CONFIG_BPF_STREAM_PARSER", }, /* xt_bpf module for passing BPF programs to netfilter */ - "CONFIG_NETFILTER_XT_MATCH_BPF", + { "CONFIG_NETFILTER_XT_MATCH_BPF", }, /* bpfilter back-end for iptables */ - "CONFIG_BPFILTER", + { "CONFIG_BPFILTER", }, /* bpftilter module with "user mode helper" */ - "CONFIG_BPFILTER_UMH", + { "CONFIG_BPFILTER_UMH", }, /* test_bpf module for BPF tests */ - "CONFIG_TEST_BPF", + { "CONFIG_TEST_BPF", }, + + /* Misc configs useful in BPF C programs */ + /* jiffies <-> sec conversion for bpf_jiffies64() helper */ + { "CONFIG_HZ", true, } }; char *values[ARRAY_SIZE(options)] = { }; struct utsname utsn; @@ -427,7 +439,8 @@ static void probe_kernel_image_config(void) while (read_next_kernel_config_option(file, buf, sizeof(buf), &value)) { for (i = 0; i < ARRAY_SIZE(options); i++) { - if (values[i] || strcmp(buf, options[i])) + if ((define_prefix && !options[i].macro_dump) || + values[i] || strcmp(buf, options[i].name)) continue; values[i] = strdup(value); @@ -439,7 +452,9 @@ end_parse: gzclose(file); for (i = 0; i < ARRAY_SIZE(options); i++) { - print_kernel_option(options[i], values[i]); + if (define_prefix && !options[i].macro_dump) + continue; + print_kernel_option(options[i].name, values[i], define_prefix); free(values[i]); } } @@ -632,23 +647,22 @@ section_system_config(enum probe_component target, const char *define_prefix) switch (target) { case COMPONENT_KERNEL: case COMPONENT_UNSPEC: - if (define_prefix) - break; - print_start_section("system_config", "Scanning system configuration...", - NULL, /* define_comment never used here */ - NULL); /* define_prefix always NULL here */ - if (check_procfs()) { - probe_unprivileged_disabled(); - probe_jit_enable(); - probe_jit_harden(); - probe_jit_kallsyms(); - probe_jit_limit(); - } else { - p_info("/* procfs not mounted, skipping related probes */"); + "/*** Misc kernel config items ***/", + define_prefix); + if (!define_prefix) { + if (check_procfs()) { + probe_unprivileged_disabled(); + probe_jit_enable(); + probe_jit_harden(); + probe_jit_kallsyms(); + probe_jit_limit(); + } else { + p_info("/* procfs not mounted, skipping related probes */"); + } } - probe_kernel_image_config(); + probe_kernel_image_config(define_prefix); print_end_section(); break; default: diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c index 6c8ca1c93f9b..d5c98f2cb12f 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c @@ -2,13 +2,13 @@ #include <test_progs.h> #include <network_helpers.h> -void test_xdp_adjust_tail(void) +void test_xdp_adjust_tail_shrink(void) { - const char *file = "./test_adjust_tail.o"; + const char *file = "./test_xdp_adjust_tail_shrink.o"; + __u32 duration, retval, size, expect_sz; struct bpf_object *obj; - char buf[128]; - __u32 duration, retval, size; int err, prog_fd; + char buf[128]; err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd); if (CHECK_FAIL(err)) @@ -21,10 +21,121 @@ void test_xdp_adjust_tail(void) "ipv4", "err %d errno %d retval %d size %d\n", err, errno, retval, size); + expect_sz = sizeof(pkt_v6) - 20; /* Test shrink with 20 bytes */ err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6), buf, &size, &retval, &duration); - CHECK(err || retval != XDP_TX || size != 54, - "ipv6", "err %d errno %d retval %d size %d\n", + CHECK(err || retval != XDP_TX || size != expect_sz, + "ipv6", "err %d errno %d retval %d size %d expect-size %d\n", + err, errno, retval, size, expect_sz); + bpf_object__close(obj); +} + +void test_xdp_adjust_tail_grow(void) +{ + const char *file = "./test_xdp_adjust_tail_grow.o"; + struct bpf_object *obj; + char buf[4096]; /* avoid segfault: large buf to hold grow results */ + __u32 duration, retval, size, expect_sz; + int err, prog_fd; + + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd); + if (CHECK_FAIL(err)) + return; + + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + buf, &size, &retval, &duration); + CHECK(err || retval != XDP_DROP, + "ipv4", "err %d errno %d retval %d size %d\n", err, errno, retval, size); + + expect_sz = sizeof(pkt_v6) + 40; /* Test grow with 40 bytes */ + err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6) /* 74 */, + buf, &size, &retval, &duration); + CHECK(err || retval != XDP_TX || size != expect_sz, + "ipv6", "err %d errno %d retval %d size %d expect-size %d\n", + err, errno, retval, size, expect_sz); + + bpf_object__close(obj); +} + +void test_xdp_adjust_tail_grow2(void) +{ + const char *file = "./test_xdp_adjust_tail_grow.o"; + char buf[4096]; /* avoid segfault: large buf to hold grow results */ + int tailroom = 320; /* SKB_DATA_ALIGN(sizeof(struct skb_shared_info))*/; + struct bpf_object *obj; + int err, cnt, i; + int max_grow; + + struct bpf_prog_test_run_attr tattr = { + .repeat = 1, + .data_in = &buf, + .data_out = &buf, + .data_size_in = 0, /* Per test */ + .data_size_out = 0, /* Per test */ + }; + + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &tattr.prog_fd); + if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno)) + return; + + /* Test case-64 */ + memset(buf, 1, sizeof(buf)); + tattr.data_size_in = 64; /* Determine test case via pkt size */ + tattr.data_size_out = 128; /* Limit copy_size */ + /* Kernel side alloc packet memory area that is zero init */ + err = bpf_prog_test_run_xattr(&tattr); + + CHECK_ATTR(errno != ENOSPC /* Due limit copy_size in bpf_test_finish */ + || tattr.retval != XDP_TX + || tattr.data_size_out != 192, /* Expected grow size */ + "case-64", + "err %d errno %d retval %d size %d\n", + err, errno, tattr.retval, tattr.data_size_out); + + /* Extra checks for data contents */ + CHECK_ATTR(tattr.data_size_out != 192 + || buf[0] != 1 || buf[63] != 1 /* 0-63 memset to 1 */ + || buf[64] != 0 || buf[127] != 0 /* 64-127 memset to 0 */ + || buf[128] != 1 || buf[191] != 1, /*128-191 memset to 1 */ + "case-64-data", + "err %d errno %d retval %d size %d\n", + err, errno, tattr.retval, tattr.data_size_out); + + /* Test case-128 */ + memset(buf, 2, sizeof(buf)); + tattr.data_size_in = 128; /* Determine test case via pkt size */ + tattr.data_size_out = sizeof(buf); /* Copy everything */ + err = bpf_prog_test_run_xattr(&tattr); + + max_grow = 4096 - XDP_PACKET_HEADROOM - tailroom; /* 3520 */ + CHECK_ATTR(err + || tattr.retval != XDP_TX + || tattr.data_size_out != max_grow,/* Expect max grow size */ + "case-128", + "err %d errno %d retval %d size %d expect-size %d\n", + err, errno, tattr.retval, tattr.data_size_out, max_grow); + + /* Extra checks for data content: Count grow size, will contain zeros */ + for (i = 0, cnt = 0; i < sizeof(buf); i++) { + if (buf[i] == 0) + cnt++; + } + CHECK_ATTR((cnt != (max_grow - tattr.data_size_in)) /* Grow increase */ + || tattr.data_size_out != max_grow, /* Total grow size */ + "case-128-data", + "err %d errno %d retval %d size %d grow-size %d\n", + err, errno, tattr.retval, tattr.data_size_out, cnt); + bpf_object__close(obj); } + +void test_xdp_adjust_tail(void) +{ + if (test__start_subtest("xdp_adjust_tail_shrink")) + test_xdp_adjust_tail_shrink(); + if (test__start_subtest("xdp_adjust_tail_grow")) + test_xdp_adjust_tail_grow(); + if (test__start_subtest("xdp_adjust_tail_grow2")) + test_xdp_adjust_tail_grow2(); +} diff --git a/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c new file mode 100644 index 000000000000..3d66599eee2e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +SEC("xdp_adjust_tail_grow") +int _xdp_adjust_tail_grow(struct xdp_md *xdp) +{ + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + unsigned int data_len; + int offset = 0; + + /* Data length determine test case */ + data_len = data_end - data; + + if (data_len == 54) { /* sizeof(pkt_v4) */ + offset = 4096; /* test too large offset */ + } else if (data_len == 74) { /* sizeof(pkt_v6) */ + offset = 40; + } else if (data_len == 64) { + offset = 128; + } else if (data_len == 128) { + offset = 4096 - 256 - 320 - data_len; /* Max tail grow 3520 */ + } else { + return XDP_ABORTED; /* No matching test */ + } + + if (bpf_xdp_adjust_tail(xdp, offset)) + return XDP_DROP; + return XDP_TX; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_adjust_tail.c b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c index b7fc85769bdc..22065a9cfb25 100644 --- a/tools/testing/selftests/bpf/progs/test_adjust_tail.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c @@ -1,5 +1,5 @@ -/* SPDX-License-Identifier: GPL-2.0 - * Copyright (c) 2018 Facebook +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2018 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -11,15 +11,15 @@ int _version SEC("version") = 1; -SEC("xdp_adjust_tail") -int _xdp_adjust_tail(struct xdp_md *xdp) +SEC("xdp_adjust_tail_shrink") +int _xdp_adjust_tail_shrink(struct xdp_md *xdp) { void *data_end = (void *)(long)xdp->data_end; void *data = (void *)(long)xdp->data; int offset = 0; - if (data_end - data == 54) - offset = 256; + if (data_end - data == 54) /* sizeof(pkt_v4) */ + offset = 256; /* shrink too much */ else offset = 20; if (bpf_xdp_adjust_tail(xdp, 0 - offset)) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 21a1ce219c1c..78a6bae56ea6 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -818,10 +818,18 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, } } +struct libcap { + struct __user_cap_header_struct hdr; + struct __user_cap_data_struct data[2]; +}; + static int set_admin(bool admin) { cap_t caps; - const cap_value_t cap_val = CAP_SYS_ADMIN; + /* need CAP_BPF, CAP_NET_ADMIN, CAP_PERFMON to load progs */ + const cap_value_t cap_net_admin = CAP_NET_ADMIN; + const cap_value_t cap_sys_admin = CAP_SYS_ADMIN; + struct libcap *cap; int ret = -1; caps = cap_get_proc(); @@ -829,11 +837,26 @@ static int set_admin(bool admin) perror("cap_get_proc"); return -1; } - if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_val, + cap = (struct libcap *)caps; + if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_sys_admin, CAP_CLEAR)) { + perror("cap_set_flag clear admin"); + goto out; + } + if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_net_admin, admin ? CAP_SET : CAP_CLEAR)) { - perror("cap_set_flag"); + perror("cap_set_flag set_or_clear net"); goto out; } + /* libcap is likely old and simply ignores CAP_BPF and CAP_PERFMON, + * so update effective bits manually + */ + if (admin) { + cap->data[1].effective |= 1 << (38 /* CAP_PERFMON */ - 32); + cap->data[1].effective |= 1 << (39 /* CAP_BPF */ - 32); + } else { + cap->data[1].effective &= ~(1 << (38 - 32)); + cap->data[1].effective &= ~(1 << (39 - 32)); + } if (cap_set_proc(caps)) { perror("cap_set_proc"); goto out; @@ -1067,9 +1090,11 @@ fail_log: static bool is_admin(void) { + cap_flag_value_t net_priv = CAP_CLEAR; + bool perfmon_priv = false; + bool bpf_priv = false; + struct libcap *cap; cap_t caps; - cap_flag_value_t sysadmin = CAP_CLEAR; - const cap_value_t cap_val = CAP_SYS_ADMIN; #ifdef CAP_IS_SUPPORTED if (!CAP_IS_SUPPORTED(CAP_SETFCAP)) { @@ -1082,11 +1107,14 @@ static bool is_admin(void) perror("cap_get_proc"); return false; } - if (cap_get_flag(caps, cap_val, CAP_EFFECTIVE, &sysadmin)) - perror("cap_get_flag"); + cap = (struct libcap *)caps; + bpf_priv = cap->data[1].effective & (1 << (39/* CAP_BPF */ - 32)); + perfmon_priv = cap->data[1].effective & (1 << (38/* CAP_PERFMON */ - 32)); + if (cap_get_flag(caps, CAP_NET_ADMIN, CAP_EFFECTIVE, &net_priv)) + perror("cap_get_flag NET"); if (cap_free(caps)) perror("cap_free"); - return (sysadmin == CAP_SET); + return bpf_priv && perfmon_priv && net_priv == CAP_SET; } static void get_unpriv_disabled() diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 2d752c4f8d9d..7629a0cebb9b 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -19,7 +19,7 @@ BPF_MOV64_IMM(BPF_REG_0, 2), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 1, @@ -315,7 +315,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "allowed for root only", + .errstr_unpriv = "allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = POINTER_VALUE, @@ -346,7 +346,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2), BPF_EXIT_INSN(), }, - .errstr_unpriv = "allowed for root only", + .errstr_unpriv = "allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = TEST_DATA_LEN + TEST_DATA_LEN - ETH_HLEN - ETH_HLEN, @@ -397,7 +397,7 @@ BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .fixup_map_hash_48b = { 3 }, .result_unpriv = REJECT, .result = ACCEPT, @@ -1064,7 +1064,7 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr_unpriv = "allowed for root only", + .errstr_unpriv = "allowed for", .result_unpriv = REJECT, .errstr = "R0 !read_ok", .result = REJECT, @@ -1977,7 +1977,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, }, @@ -2003,7 +2003,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .errstr = "!read_ok", .result = REJECT, }, @@ -2028,7 +2028,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .errstr = "!read_ok", .result = REJECT, }, diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c index 50a8a63be4ac..5cf361d8eb1c 100644 --- a/tools/testing/selftests/bpf/verifier/dead_code.c +++ b/tools/testing/selftests/bpf/verifier/dead_code.c @@ -85,7 +85,7 @@ BPF_MOV64_IMM(BPF_REG_0, 12), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -103,7 +103,7 @@ BPF_MOV64_IMM(BPF_REG_0, 12), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -121,7 +121,7 @@ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -5), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -137,7 +137,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, @@ -152,7 +152,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, |