author    Krishna Kumar <krikku@gmail.com>          2025-05-20 20:06:56 +0300
committer Tony Nguyen <anthony.l.nguyen@intel.com>  2025-06-17 20:09:18 +0300
commit    5d3bc9e5e725aa36cca9b794e340057feb6880b4 (patch)
tree      6cef530db88e3d79cb6a2cbbcfbb62b9ee264318 /drivers/net
parent    7b4ac12cc929e281cf7edc22203e0533790ebc2b (diff)
net: ice: Perform accurate aRFS flow match
This patch fixes an issue seen in a large-scale deployment under heavy
incoming packet load, where aRFS wrongly matches a flow and reprograms the
NIC with the wrong settings. That mis-steering causes RX-path latency
spikes and noisy-neighbor effects when many connections collide on the
same hash (some of our production servers have 20-30K connections).

set_rps_cpu() calls ndo_rx_flow_steer() with a flow_id that is calculated
by hashing the skb and reducing the hash to the per rx-queue table size.
This results in multiple connections (even across different rx-queues)
getting the same hash value. The driver steer function then modifies the
wrong flow to use this rx-queue, e.g.:

Flow#1 is first added:
	Flow#1:  <ip1, port1, ip2, port2>, Hash 'h', q#10

Later when a new flow needs to be added:
	Flow#2:  <ip3, port3, ip4, port4>, Hash 'h', q#20

The driver finds the hash 'h' from Flow#1 and updates it to use q#20. This
leaves both flows un-optimized: packets for Flow#1 go to q#20, get
reprogrammed back to q#10 later, and so on, while Flow#2 is never
programmed because Flow#1 is matched first for all misses. Many flows may
wrongly share the same hash and keep reprogramming the original flow's
rule, each with its own q#.

Tested on two 144-core servers with 16K netperf sessions for 180s. Netperf
clients are pinned to cores 0-71 sequentially (so that wrongly steered
packets on q#s 72-143 can be measured). IRQs are mapped 1:1 from queues to
CPUs, XPS is enabled, and aRFS is enabled (the global value is
144 * rps_flow_cnt).

Test notes about results from ice_rx_flow_steer():
---------------------------------------------------
1. "Skip:" counter increments here:
	if (fltr_info->q_index == rxq_idx ||
	    arfs_entry->fltr_state != ICE_ARFS_ACTIVE)
		goto out;
2. "Add:" counter increments here:
	ret = arfs_entry->fltr_info.fltr_id;
	INIT_HLIST_NODE(&arfs_entry->list_entry);
3. "Update:" counter increments here:
	/* update the queue to forward to on an already existing flow */

Runtime comparison: original code vs. with the patch, for different
rps_flow_cnt values.

+-------------------------------+--------------+--------------+
| rps_flow_cnt                  |     512      |     2048     |
+-------------------------------+--------------+--------------+
| Ratio of Pkts on Good:Bad q's | 214 vs 822K  | 1.1M vs 980K |
| Avoid wrong aRFS programming  | 0 vs 310K    | 0 vs 30K     |
| CPU User                      | 216 vs 183   | 216 vs 206   |
| CPU System                    | 1441 vs 1171 | 1447 vs 1320 |
| CPU Softirq                   | 1245 vs 920  | 1238 vs 961  |
| CPU Total                     | 29 vs 22.7   | 29 vs 24.9   |
| aRFS Update                   | 533K vs 59   | 521K vs 32   |
| aRFS Skip                     | 82M vs 77M   | 7.2M vs 4.5M |
+-------------------------------+--------------+--------------+

Separate TCP_STREAM and TCP_RR runs with 1,4,8,16,64,128,256,512
connections showed no performance degradation.

Some points on the patch/aRFS behavior:
1. Enabling full tuple matching ensures flows are always correctly
   matched, even with smaller hash sizes.
2. 5-6% drop in CPU utilization, as packets arrive at the correct CPUs
   and fewer calls are made to the driver for programming on misses.
3. Larger hash tables reduce mis-steering due to more unique flow hashes,
   but clashes still occur. However, with a larger per-device
   rps_flow_cnt, old flows take more time to expire, and new aRFS flows
   cannot be added if h/w limits are reached (rps_may_expire_flow()
   succeeds once 10*rps_flow_cnt packets not belonging to the flow have
   been processed by this cpu).
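To make the collision concrete, here is a minimal user-space sketch (not
kernel code; toy_hash(), the addresses and the ports are made-up
illustrations rather than the kernel's skb hash). It shows how masking a
flow hash down to the per rx-queue table size lets two different 4-tuples
share the same flow_id, which is exactly the situation the new
ice_arfs_cmp() full-tuple check rejects:

#include <stdint.h>
#include <stdio.h>

struct tuple {
	uint32_t saddr, daddr;
	uint16_t sport, dport;
};

/* Stand-in for the skb flow hash; any reasonable 32-bit mix will do for
 * the illustration.
 */
static uint32_t toy_hash(const struct tuple *t)
{
	uint32_t h = t->saddr * 2654435761u ^ t->daddr;

	h ^= (uint32_t)t->sport << 16 | t->dport;
	h *= 2246822519u;
	h ^= h >> 15;	/* fold high bits down so the ports reach the mask */
	return h * 2654435761u;
}

int main(void)
{
	const uint32_t mask = 512 - 1;	/* rps_flow_cnt = 512 per rx-queue */
	struct tuple flow1 = { 0x0a000001, 0x0a000002, 40000, 443 };
	struct tuple flow2 = flow1;
	uint32_t id1 = toy_hash(&flow1) & mask, id2 = id1 + 1;
	unsigned int sport;

	/* Walk source ports until a *different* 4-tuple lands on the same
	 * masked flow_id as flow1.
	 */
	for (sport = 40001; sport < 65536; sport++) {
		flow2.sport = (uint16_t)sport;
		id2 = toy_hash(&flow2) & mask;
		if (id2 == id1)
			break;
	}

	if (id2 == id1)
		printf("sport %u and sport %u share flow_id %u\n",
		       flow1.sport, flow2.sport, id1);
	return 0;
}

With only 9 hash bits distinguishing flows in a 512-entry table, a
colliding port typically turns up after a few hundred tries; matching on
the full tuple, as the patch does, is what keeps such collisions from
reprogramming an unrelated flow's filter.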
Fixes: 28bf26724fdb0 ("ice: Implement aRFS")
Signed-off-by: Krishna Kumar <krikku@gmail.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Tested-by: Rinitha S <sx.rinitha@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Diffstat (limited to 'drivers/net')
-rw-r--r--  drivers/net/ethernet/intel/ice/ice_arfs.c | 48
1 file changed, 48 insertions(+), 0 deletions(-)
diff --git a/drivers/net/ethernet/intel/ice/ice_arfs.c b/drivers/net/ethernet/intel/ice/ice_arfs.c
index 2bc5c7f59844..1f7834c03550 100644
--- a/drivers/net/ethernet/intel/ice/ice_arfs.c
+++ b/drivers/net/ethernet/intel/ice/ice_arfs.c
@@ -378,6 +378,50 @@ ice_arfs_is_perfect_flow_set(struct ice_hw *hw, __be16 l3_proto, u8 l4_proto)
}
/**
+ * ice_arfs_cmp - Check if aRFS filter matches this flow.
+ * @fltr_info: filter info of the saved ARFS entry.
+ * @fk: flow dissector keys.
+ * @n_proto: One of htons(ETH_P_IP) or htons(ETH_P_IPV6).
+ * @ip_proto: One of IPPROTO_TCP or IPPROTO_UDP.
+ *
+ * Since this function assumes limited values for n_proto and ip_proto, it
+ * is meant to be called only from ice_rx_flow_steer().
+ *
+ * Return:
+ * * true - fltr_info refers to the same flow as fk.
+ * * false - fltr_info and fk refer to different flows.
+ */
+static bool
+ice_arfs_cmp(const struct ice_fdir_fltr *fltr_info, const struct flow_keys *fk,
+ __be16 n_proto, u8 ip_proto)
+{
+ /* Determine if the filter is for IPv4 or IPv6 based on flow_type,
+ * which is one of ICE_FLTR_PTYPE_NONF_IPV{4,6}_{TCP,UDP}.
+ */
+ bool is_v4 = fltr_info->flow_type == ICE_FLTR_PTYPE_NONF_IPV4_TCP ||
+ fltr_info->flow_type == ICE_FLTR_PTYPE_NONF_IPV4_UDP;
+
+ /* The checks below are ordered with the quickest and most discriminative
+ * fields first, so that mismatched flows fail the comparison early.
+ */
+ if (is_v4)
+ return n_proto == htons(ETH_P_IP) &&
+ fltr_info->ip.v4.src_port == fk->ports.src &&
+ fltr_info->ip.v4.dst_port == fk->ports.dst &&
+ fltr_info->ip.v4.src_ip == fk->addrs.v4addrs.src &&
+ fltr_info->ip.v4.dst_ip == fk->addrs.v4addrs.dst &&
+ fltr_info->ip.v4.proto == ip_proto;
+
+ return fltr_info->ip.v6.src_port == fk->ports.src &&
+ fltr_info->ip.v6.dst_port == fk->ports.dst &&
+ fltr_info->ip.v6.proto == ip_proto &&
+ !memcmp(&fltr_info->ip.v6.src_ip, &fk->addrs.v6addrs.src,
+ sizeof(struct in6_addr)) &&
+ !memcmp(&fltr_info->ip.v6.dst_ip, &fk->addrs.v6addrs.dst,
+ sizeof(struct in6_addr));
+}
+
+/**
* ice_rx_flow_steer - steer the Rx flow to where application is being run
* @netdev: ptr to the netdev being adjusted
* @skb: buffer with required header information
@@ -448,6 +492,10 @@ ice_rx_flow_steer(struct net_device *netdev, const struct sk_buff *skb,
continue;
fltr_info = &arfs_entry->fltr_info;
+
+ if (!ice_arfs_cmp(fltr_info, &fk, n_proto, ip_proto))
+ continue;
+
ret = fltr_info->fltr_id;
if (fltr_info->q_index == rxq_idx ||