From f9b53bb13923f0a6ed2e784a35301cfb4c28f4f4 Mon Sep 17 00:00:00 2001
From: Song Yoong Siang
Date: Sun, 16 Feb 2025 17:34:29 +0800
Subject: igc: Refactor empty frame insertion for launch time support

Refactor the code for inserting an empty frame into a new function
igc_insert_empty_frame(). This change extracts the logic for inserting
an empty packet from igc_xmit_frame_ring() into a separate function,
allowing it to be reused in future implementations, such as the XDP
zero copy transmit function.

Remove the igc_desc_unused() check in igc_init_tx_empty_descriptor()
because the number of descriptors needed is guaranteed.

Ensure that skb allocation and DMA mapping succeed for the empty frame
before proceeding to fill in the igc_tx_buffer info, context
descriptor, and data descriptor.

Rate limit the error messages for skb allocation and DMA mapping
failures.

Update the comment to indicate that the 2 descriptors needed by the
empty frame are already accounted for in igc_xmit_frame_ring().

Handle the case where the insertion of an empty frame fails and explain
the reason behind this handling.

Signed-off-by: Song Yoong Siang
Signed-off-by: Martin KaFai Lau
Reviewed-by: Faizal Rahim
Reviewed-by: Maciej Fijalkowski
Link: https://patch.msgid.link/20250216093430.957880-5-yoong.siang.song@intel.com
---
 drivers/net/ethernet/intel/igc/igc_main.c | 82 +++++++++++++++++++------------
 1 file changed, 50 insertions(+), 32 deletions(-)

diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 84307bb7313e..1bfa71545e37 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -1092,7 +1092,8 @@ static int igc_init_empty_frame(struct igc_ring *ring,
 
 	dma = dma_map_single(ring->dev, skb->data, size, DMA_TO_DEVICE);
 	if (dma_mapping_error(ring->dev, dma)) {
-		netdev_err_once(ring->netdev, "Failed to map DMA for TX\n");
+		net_err_ratelimited("%s: DMA mapping error for empty frame\n",
+				    netdev_name(ring->netdev));
 		return -ENOMEM;
 	}
 
@@ -1108,20 +1109,12 @@ static int igc_init_empty_frame(struct igc_ring *ring,
 	return 0;
 }
 
-static int igc_init_tx_empty_descriptor(struct igc_ring *ring,
-					struct sk_buff *skb,
-					struct igc_tx_buffer *first)
+static void igc_init_tx_empty_descriptor(struct igc_ring *ring,
+					 struct sk_buff *skb,
+					 struct igc_tx_buffer *first)
 {
 	union igc_adv_tx_desc *desc;
 	u32 cmd_type, olinfo_status;
-	int err;
-
-	if (!igc_desc_unused(ring))
-		return -EBUSY;
-
-	err = igc_init_empty_frame(ring, first, skb);
-	if (err)
-		return err;
 
 	cmd_type = IGC_ADVTXD_DTYP_DATA | IGC_ADVTXD_DCMD_DEXT |
 		   IGC_ADVTXD_DCMD_IFCS | IGC_TXD_DCMD |
@@ -1140,8 +1133,6 @@ static int igc_init_tx_empty_descriptor(struct igc_ring *ring,
 	ring->next_to_use++;
 	if (ring->next_to_use == ring->count)
 		ring->next_to_use = 0;
-
-	return 0;
 }
 
 #define IGC_EMPTY_FRAME_SIZE 60
@@ -1567,6 +1558,40 @@ static bool igc_request_tx_tstamp(struct igc_adapter *adapter, struct sk_buff *s
 	return false;
 }
 
+static int igc_insert_empty_frame(struct igc_ring *tx_ring)
+{
+	struct igc_tx_buffer *empty_info;
+	struct sk_buff *empty_skb;
+	void *data;
+	int ret;
+
+	empty_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
+	empty_skb = alloc_skb(IGC_EMPTY_FRAME_SIZE, GFP_ATOMIC);
+	if (unlikely(!empty_skb)) {
+		net_err_ratelimited("%s: skb alloc error for empty frame\n",
+				    netdev_name(tx_ring->netdev));
+		return -ENOMEM;
+	}
+
+	data = skb_put(empty_skb, IGC_EMPTY_FRAME_SIZE);
+	memset(data, 0, IGC_EMPTY_FRAME_SIZE);
+
+	/* Prepare DMA mapping and Tx buffer information */
+	ret = igc_init_empty_frame(tx_ring, empty_info, empty_skb);
+	if (unlikely(ret)) {
+		dev_kfree_skb_any(empty_skb);
+		return ret;
+	}
+
+	/* Prepare advanced context descriptor for empty packet */
+	igc_tx_ctxtdesc(tx_ring, 0, false, 0, 0, 0);
+
+	/* Prepare advanced data descriptor for empty packet */
+	igc_init_tx_empty_descriptor(tx_ring, empty_skb, empty_info);
+
+	return 0;
+}
+
 static netdev_tx_t igc_xmit_frame_ring(struct sk_buff *skb,
 				       struct igc_ring *tx_ring)
 {
@@ -1586,6 +1611,7 @@ static netdev_tx_t igc_xmit_frame_ring(struct sk_buff *skb,
 	 *	+ 1 desc for skb_headlen/IGC_MAX_DATA_PER_TXD,
 	 *	+ 2 desc gap to keep tail from touching head,
 	 *	+ 1 desc for context descriptor,
+	 *	+ 2 desc for inserting an empty packet for launch time,
 	 * otherwise try next time
 	 */
 	for (f = 0; f < skb_shinfo(skb)->nr_frags; f++)
@@ -1605,24 +1631,16 @@ static netdev_tx_t igc_xmit_frame_ring(struct sk_buff *skb,
 	launch_time = igc_tx_launchtime(tx_ring, txtime, &first_flag, &insert_empty);
 
 	if (insert_empty) {
-		struct igc_tx_buffer *empty_info;
-		struct sk_buff *empty;
-		void *data;
-
-		empty_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
-		empty = alloc_skb(IGC_EMPTY_FRAME_SIZE, GFP_ATOMIC);
-		if (!empty)
-			goto done;
-
-		data = skb_put(empty, IGC_EMPTY_FRAME_SIZE);
-		memset(data, 0, IGC_EMPTY_FRAME_SIZE);
-
-		igc_tx_ctxtdesc(tx_ring, 0, false, 0, 0, 0);
-
-		if (igc_init_tx_empty_descriptor(tx_ring,
-						 empty,
-						 empty_info) < 0)
-			dev_kfree_skb_any(empty);
+		/* Reset the launch time if the required empty frame fails to
+		 * be inserted. However, this packet is not dropped, so it
+		 * "dirties" the current Qbv cycle. This ensures that the
+		 * upcoming packet, which is scheduled in the next Qbv cycle,
+		 * does not require an empty frame. This way, the launch time
+		 * continues to function correctly despite the current failure
+		 * to insert the empty frame.
+		 */
+		if (igc_insert_empty_frame(tx_ring))
+			launch_time = 0;
 	}
 
 done:
-- 
cgit v1.2.3

From d7c3a7ff75028bea9e4879bd3dcd04a6dc3e33c8 Mon Sep 17 00:00:00 2001
From: Song Yoong Siang
Date: Sun, 16 Feb 2025 17:34:30 +0800
Subject: igc: Add launch time support to XDP ZC

Enable Launch Time Control (LTC) support for XDP zero copy via the XDP
Tx metadata framework.

This patch has been tested with tools/testing/selftests/bpf/xdp_hw_metadata
on an Intel I225-LM Ethernet controller. Below are the test steps and
result.

Test 1: Send a single packet with the launch time set to 1 s in the
future.

Test steps:
1. On the DUT, start the xdp_hw_metadata selftest application:
   $ sudo ./xdp_hw_metadata enp2s0 -l 1000000000 -L 1

2. On the Link Partner, send a UDP packet with VLAN priority 1 to port
   9091 of the DUT.

Result: When the launch time is set to 1 s in the future, the delta
between the launch time and the transmit hardware timestamp is 0.016 us,
as shown in the printout of the xdp_hw_metadata application below.

0x562ff5dc8880: rx_desc[4]->addr=84110 addr=84110 comp_addr=84110 EoP
rx_hash: 0xE343384 with RSS type:0x1
HW RX-time:   1734578015467548904 (sec:1734578015.4675) delta to User RX-time sec:0.0002 (183.103 usec)
XDP RX-time:   1734578015467651698 (sec:1734578015.4677) delta to User RX-time sec:0.0001 (80.309 usec)
No rx_vlan_tci or rx_vlan_proto, err=-95
0x562ff5dc8880: ping-pong with csum=561c (want c7dd) csum_start=34 csum_offset=6
HW RX-time:   1734578015467548904 (sec:1734578015.4675) delta to HW Launch-time sec:1.0000 (1000000.000 usec)
0x562ff5dc8880: complete tx idx=4 addr=4018
HW Launch-time:   1734578016467548904 (sec:1734578016.4675) delta to HW TX-complete-time sec:0.0000 (0.016 usec)
HW TX-complete-time:   1734578016467548920 (sec:1734578016.4675) delta to User TX-complete-time sec:0.0000 (32.546 usec)
XDP RX-time:   1734578015467651698 (sec:1734578015.4677) delta to User TX-complete-time sec:0.9999 (999929.768 usec)
HW RX-time:   1734578015467548904 (sec:1734578015.4675) delta to HW TX-complete-time sec:1.0000 (1000000.016 usec)
0x562ff5dc8880: complete rx idx=132 addr=84110

Test 2: Send 1000 packets with a 10 ms interval and the launch time set
to 500 us in the future.

Test steps:
1. On the DUT, start the xdp_hw_metadata selftest application:
   $ sudo chrt -f 99 ./xdp_hw_metadata enp2s0 -l 500000 -L 1 > \
     /dev/shm/result.log

2. On the Link Partner, send 1000 UDP packets with a 10 ms interval and
   VLAN priority 1 to port 9091 of the DUT.

Result: When the launch time is set to 500 us in the future, the average
delta between the launch time and the transmit hardware timestamp is
0.016 us, as shown in the analysis of /dev/shm/result.log below. The XDP
launch time works correctly in sending 1000 packets continuously.

Min delta: 0.005 us
Avr delta: 0.016 us
Max delta: 0.031 us
Total packets forwarded: 1000

Signed-off-by: Song Yoong Siang
Signed-off-by: Martin KaFai Lau
Reviewed-by: Faizal Rahim
Reviewed-by: Maciej Fijalkowski
Link: https://patch.msgid.link/20250216093430.957880-6-yoong.siang.song@intel.com
---
 drivers/net/ethernet/intel/igc/igc.h      |  1 +
 drivers/net/ethernet/intel/igc/igc_main.c | 61 ++++++++++++++++++++++++++++++-
 2 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
index b8111ad9a9a8..cd1d7b6c1782 100644
--- a/drivers/net/ethernet/intel/igc/igc.h
+++ b/drivers/net/ethernet/intel/igc/igc.h
@@ -579,6 +579,7 @@ struct igc_metadata_request {
 	struct xsk_tx_metadata *meta;
 	struct igc_ring *tx_ring;
 	u32 cmd_type;
+	u16 used_desc;
 };
 
 struct igc_q_vector {
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 1bfa71545e37..3044392e8ded 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -2971,9 +2971,48 @@ static u64 igc_xsk_fill_timestamp(void *_priv)
 	return *(u64 *)_priv;
 }
 
+static void igc_xsk_request_launch_time(u64 launch_time, void *_priv)
+{
+	struct igc_metadata_request *meta_req = _priv;
+	struct igc_ring *tx_ring = meta_req->tx_ring;
+	__le32 launch_time_offset;
+	bool insert_empty = false;
+	bool first_flag = false;
+	u16 used_desc = 0;
+
+	if (!tx_ring->launchtime_enable)
+		return;
+
+	launch_time_offset = igc_tx_launchtime(tx_ring,
+					       ns_to_ktime(launch_time),
+					       &first_flag, &insert_empty);
+	if (insert_empty) {
+		/* Disregard the launch time request if the required empty frame
+		 * fails to be inserted.
+		 */
+		if (igc_insert_empty_frame(tx_ring))
+			return;
+
+		meta_req->tx_buffer =
+			&tx_ring->tx_buffer_info[tx_ring->next_to_use];
+		/* Inserting an empty packet requires two descriptors:
+		 * one data descriptor and one context descriptor.
+		 */
+		used_desc += 2;
+	}
+
+	/* Use one context descriptor to specify launch time and first flag. */
+	igc_tx_ctxtdesc(tx_ring, launch_time_offset, first_flag, 0, 0, 0);
+	used_desc += 1;
+
+	/* Update the number of used descriptors in this request */
+	meta_req->used_desc += used_desc;
+}
+
 const struct xsk_tx_metadata_ops igc_xsk_tx_metadata_ops = {
 	.tmo_request_timestamp	= igc_xsk_request_timestamp,
 	.tmo_fill_timestamp	= igc_xsk_fill_timestamp,
+	.tmo_request_launch_time = igc_xsk_request_launch_time,
 };
 
 static void igc_xdp_xmit_zc(struct igc_ring *ring)
@@ -2996,7 +3035,13 @@ static void igc_xdp_xmit_zc(struct igc_ring *ring)
 	ntu = ring->next_to_use;
 	budget = igc_desc_unused(ring);
 
-	while (xsk_tx_peek_desc(pool, &xdp_desc) && budget--) {
+	/* Packets with launch time require one data descriptor and one context
+	 * descriptor. When the launch time falls into the next Qbv cycle, we
+	 * may need to insert an empty packet, which requires two more
+	 * descriptors. Therefore, to be safe, we always ensure we have at least
+	 * 4 descriptors available.
+	 */
+	while (xsk_tx_peek_desc(pool, &xdp_desc) && budget >= 4) {
 		struct igc_metadata_request meta_req;
 		struct xsk_tx_metadata *meta = NULL;
 		struct igc_tx_buffer *bi;
@@ -3017,9 +3062,19 @@ static void igc_xdp_xmit_zc(struct igc_ring *ring)
 		meta_req.tx_ring = ring;
 		meta_req.tx_buffer = bi;
 		meta_req.meta = meta;
+		meta_req.used_desc = 0;
 		xsk_tx_metadata_request(meta, &igc_xsk_tx_metadata_ops,
 					&meta_req);
 
+		/* xsk_tx_metadata_request() may have updated next_to_use */
+		ntu = ring->next_to_use;
+
+		/* xsk_tx_metadata_request() may have updated Tx buffer info */
+		bi = meta_req.tx_buffer;
+
+		/* xsk_tx_metadata_request() may use a few descriptors */
+		budget -= meta_req.used_desc;
+
 		tx_desc = IGC_TX_DESC(ring, ntu);
 		tx_desc->read.cmd_type_len = cpu_to_le32(meta_req.cmd_type);
 		tx_desc->read.olinfo_status = cpu_to_le32(olinfo_status);
@@ -3037,9 +3092,11 @@ static void igc_xdp_xmit_zc(struct igc_ring *ring)
 		ntu++;
 		if (ntu == ring->count)
 			ntu = 0;
+
+		ring->next_to_use = ntu;
+		budget--;
 	}
 
-	ring->next_to_use = ntu;
 	if (tx_desc) {
 		igc_flush_tx_descriptors(ring);
 		xsk_tx_release(pool);
-- 
cgit v1.2.3

From e6116fc605574bb58c2016938ff24a7fbafe6e2a Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Mon, 24 Feb 2025 21:33:55 -0500
Subject: net: skb: free up one bit in tx_flags

The linked series wants to add skb tx completion timestamps. That
needs a bit in skb_shared_info.tx_flags, but all are in use.

A per-skb bit is only needed for features that are configured on a per
packet basis. Per socket features can be read from sk->sk_tsflags.

Per packet tsflags can be set in sendmsg using cmsg, but only those in
SOF_TIMESTAMPING_TX_RECORD_MASK.

Per packet tsflags can also be set without cmsg by sandwiching a send
in between two setsockopts:

    val |= SOF_TIMESTAMPING_$FEATURE;
    setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));
    write(fd, buf, sz);
    val &= ~SOF_TIMESTAMPING_$FEATURE;
    setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));

Changing a datapath test from skb_shinfo(skb)->tx_flags to
skb->sk->sk_tsflags can change behavior in that case, as tx_flags is
written before the second setsockopt updates sk_tsflags.
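
As an illustrative sketch only (not taken from the diffs below, and
use_cycles is just a placeholder for the driver's "use the free running
counter" decision), the difference is when the flag is sampled: the
per-skb bit is frozen by __sock_tx_timestamp() at send time, while
sk_tsflags is read again at tx completion time and may already reflect
the second setsockopt of the sandwich above:

    /* Old check: per-skb flag captured at sendmsg() time. */
    if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_USE_CYCLES)
            use_cycles = true;

    /* New check: current socket flags, read at tx completion time. */
    if (skb->sk &&
        READ_ONCE(skb->sk->sk_tsflags) & SOF_TIMESTAMPING_BIND_PHC)
            use_cycles = true;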

Therefore, only bits can be reclaimed that cannot be set by cmsg and
are also highly unlikely to be used to target individual packets
otherwise.

Free up the bit currently used for SKBTX_HW_TSTAMP_USE_CYCLES. This
selects between clock and free running counter source for HW TX
timestamps. It is probable that all packets of the same socket will
always use the same source.

Link: https://lore.kernel.org/netdev/cover.1739988644.git.pav@iki.fi/
Signed-off-by: Willem de Bruijn
Reviewed-by: Jason Xing
Reviewed-by: Gerhard Engleder
Link: https://patch.msgid.link/20250225023416.2088705-1-willemdebruijn.kernel@gmail.com
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/engleder/tsnep_main.c |  4 ++--
 drivers/net/ethernet/intel/igc/igc_main.c  |  3 ++-
 include/linux/skbuff.h                     |  5 ++---
 net/socket.c                               | 11 +----------
 4 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c
index 0d030cb0b21c..3de4cb06e266 100644
--- a/drivers/net/ethernet/engleder/tsnep_main.c
+++ b/drivers/net/ethernet/engleder/tsnep_main.c
@@ -852,8 +852,8 @@ static bool tsnep_tx_poll(struct tsnep_tx *tx, int napi_budget)
 			struct skb_shared_hwtstamps hwtstamps;
 			u64 timestamp;
 
-			if (skb_shinfo(entry->skb)->tx_flags &
-			    SKBTX_HW_TSTAMP_USE_CYCLES)
+			if (entry->skb->sk &&
+			    READ_ONCE(entry->skb->sk->sk_tsflags) & SOF_TIMESTAMPING_BIND_PHC)
 				timestamp =
 					__le64_to_cpu(entry->desc_wb->counter);
 			else
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 3044392e8ded..472f009630c9 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -1668,7 +1668,8 @@ done:
 	if (igc_request_tx_tstamp(adapter, skb, &tstamp_flags)) {
 		skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
 		tx_flags |= IGC_TX_FLAGS_TSTAMP | tstamp_flags;
-		if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_USE_CYCLES)
+		if (skb->sk &&
+		    READ_ONCE(skb->sk->sk_tsflags) & SOF_TIMESTAMPING_BIND_PHC)
 			tx_flags |= IGC_TX_FLAGS_TSTAMP_TIMER_1;
 	} else {
 		adapter->tx_hwtstamp_skipped++;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f2bb8473d99a..171aa15f6541 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -478,8 +478,8 @@ enum {
 	/* device driver is going to provide hardware time stamp */
 	SKBTX_IN_PROGRESS = 1 << 2,
 
-	/* generate hardware time stamp based on cycles if supported */
-	SKBTX_HW_TSTAMP_USE_CYCLES = 1 << 3,
+	/* reserved */
+	SKBTX_RESERVED = 1 << 3,
 
 	/* generate wifi status information (where possible) */
 	SKBTX_WIFI_STATUS = 1 << 4,
@@ -500,7 +500,6 @@ enum {
 				 SKBTX_SCHED_TSTAMP | \
 				 SKBTX_BPF)
 #define SKBTX_ANY_TSTAMP	(SKBTX_HW_TSTAMP | \
-				 SKBTX_HW_TSTAMP_USE_CYCLES | \
 				 SKBTX_ANY_SW_TSTAMP)
 
 /* Definitions for flags in struct skb_shared_info */
diff --git a/net/socket.c b/net/socket.c
index 0545e9ea7058..b64ecf2722e7 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -680,18 +680,9 @@ void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags)
 {
 	u8 flags = *tx_flags;
 
-	if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE) {
+	if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE)
 		flags |= SKBTX_HW_TSTAMP_NOBPF;
 
-		/* PTP hardware clocks can provide a free running cycle counter
-		 * as a time base for virtual clocks. Tell driver to use the
-		 * free running cycle counter for timestamp if socket is bound
-		 * to virtual clock.
-		 */
-		if (tsflags & SOF_TIMESTAMPING_BIND_PHC)
-			flags |= SKBTX_HW_TSTAMP_USE_CYCLES;
-	}
-
 	if (tsflags & SOF_TIMESTAMPING_TX_SOFTWARE)
 		flags |= SKBTX_SW_TSTAMP;
 
-- 
cgit v1.2.3
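
For context, an illustrative user-space sketch (not part of the series
above; the helper name and the vclock index 1 are made-up examples) of
how a socket ends up with SOF_TIMESTAMPING_BIND_PHC set in sk_tsflags,
which the driver hunks above now test at tx completion instead of the
removed per-skb SKBTX_HW_TSTAMP_USE_CYCLES bit:

    #include <sys/socket.h>
    #include <linux/net_tstamp.h>

    /* Bind this socket's HW timestamps to PTP virtual clock 1, so the
     * driver selects the free running counter as the time base.
     */
    static int bind_socket_to_vclock(int fd)
    {
            struct so_timestamping cfg = {
                    .flags = SOF_TIMESTAMPING_TX_HARDWARE |
                             SOF_TIMESTAMPING_RAW_HARDWARE |
                             SOF_TIMESTAMPING_BIND_PHC,
                    .bind_phc = 1,
            };

            return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &cfg,
                              sizeof(cfg));
    }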