summary refs log tree commit diff
path: root/net
diff options
context:
space:
mode:
authorEric Dumazet <edumazet@google.com>2026-03-02 21:14:29 +0300
committerJakub Kicinski <kuba@kernel.org>2026-03-05 03:54:09 +0300
commitdd378109d20ff6789091fa3558607c1d242d80ad (patch)
tree45595854374fd90b3474ed54673e5b18b1683672 /net
parent9cde131cdd888873363b5d9dfd8d4d4c1fae6986 (diff)
downloadlinux-dd378109d20ff6789091fa3558607c1d242d80ad.tar.xz
net-sysfs: use rps_tag_ptr and remove metadata from rps_sock_flow_table
Instead of storing the @mask at the beginning of rps_sock_flow_table, use the 5 low-order bits of the rps_tag_ptr to store the log2 of the table size. This removes a potential cache-line miss to fetch @mask. More importantly, we can switch to vmalloc_huge() without wasting memory.

Tested with:

numactl --interleave=all bash -c "echo 4194304 >/proc/sys/net/core/rps_sock_flow_entries"

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20260302181432.1836150-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net')
-rw-r--r--net/core/dev.c12
-rw-r--r--net/core/sysctl_net_core.c89
2 files changed, 54 insertions, 47 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 92f8eeac8de3..7ae87be81afc 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5075,9 +5075,9 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow **rflowp)
{
- const struct rps_sock_flow_table *sock_flow_table;
struct netdev_rx_queue *rxqueue = dev->_rx;
struct rps_dev_flow_table *flow_table;
+ rps_tag_ptr global_tag_ptr;
struct rps_map *map;
int cpu = -1;
u32 tcpu;
@@ -5108,8 +5108,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
if (!hash)
goto done;
- sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
- if (flow_table && sock_flow_table) {
+ global_tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table);
+ if (flow_table && global_tag_ptr) {
+ struct rps_sock_flow_table *sock_flow_table;
struct rps_dev_flow *rflow;
u32 next_cpu;
u32 flow_id;
@@ -5118,8 +5119,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
/* First check into global flow table if there is a match.
* This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
*/
- flow_id = hash & rps_sock_flow_table_mask(sock_flow_table);
- ident = READ_ONCE(sock_flow_table->ents[flow_id]);
+ flow_id = hash & rps_tag_to_mask(global_tag_ptr);
+ sock_flow_table = rps_tag_to_table(global_tag_ptr);
+ ident = READ_ONCE(sock_flow_table[flow_id].ent);
if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask)
goto try_rps;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index cfbe798493b5..502705e04649 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -138,68 +138,73 @@ done:
static int rps_sock_flow_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
+ struct rps_sock_flow_table *o_sock_table, *sock_table;
+ static DEFINE_MUTEX(sock_flow_mutex);
+ rps_tag_ptr o_tag_ptr, tag_ptr;
unsigned int orig_size, size;
- int ret, i;
struct ctl_table tmp = {
.data = &size,
.maxlen = sizeof(size),
.mode = table->mode
};
- struct rps_sock_flow_table *o_sock_table, *sock_table;
- static DEFINE_MUTEX(sock_flow_mutex);
void *tofree = NULL;
+ int ret, i;
+ u8 log;
mutex_lock(&sock_flow_mutex);
- o_sock_table = rcu_dereference_protected(
- net_hotdata.rps_sock_flow_table,
- lockdep_is_held(&sock_flow_mutex));
- size = o_sock_table ? rps_sock_flow_table_mask(o_sock_table) + 1 : 0;
+ o_tag_ptr = tag_ptr = net_hotdata.rps_sock_flow_table;
+
+ size = o_tag_ptr ? rps_tag_to_mask(o_tag_ptr) + 1 : 0;
+ o_sock_table = rps_tag_to_table(o_tag_ptr);
orig_size = size;
ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
- if (write) {
- if (size) {
- if (size > 1<<29) {
- /* Enforce limit to prevent overflow */
+ if (!write)
+ goto unlock;
+
+ if (size) {
+ if (size > 1<<29) {
+ /* Enforce limit to prevent overflow */
+ mutex_unlock(&sock_flow_mutex);
+ return -EINVAL;
+ }
+ sock_table = o_sock_table;
+ size = roundup_pow_of_two(size);
+ if (size != orig_size) {
+ sock_table = vmalloc_huge(size * sizeof(*sock_table),
+ GFP_KERNEL);
+ if (!sock_table) {
mutex_unlock(&sock_flow_mutex);
- return -EINVAL;
- }
- sock_table = o_sock_table;
- size = roundup_pow_of_two(size);
- if (size != orig_size) {
- sock_table =
- vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
- if (!sock_table) {
- mutex_unlock(&sock_flow_mutex);
- return -ENOMEM;
- }
- net_hotdata.rps_cpu_mask =
- roundup_pow_of_two(nr_cpu_ids) - 1;
- sock_table->_mask = size - 1;
+ return -ENOMEM;
}
+ net_hotdata.rps_cpu_mask =
+ roundup_pow_of_two(nr_cpu_ids) - 1;
+ log = ilog2(size);
+ tag_ptr = (rps_tag_ptr)sock_table | log;
+ }
- for (i = 0; i < size; i++)
- sock_table->ents[i] = RPS_NO_CPU;
- } else
- sock_table = NULL;
-
- if (sock_table != o_sock_table) {
- rcu_assign_pointer(net_hotdata.rps_sock_flow_table,
- sock_table);
- if (sock_table) {
- static_branch_inc(&rps_needed);
- static_branch_inc(&rfs_needed);
- }
- if (o_sock_table) {
- static_branch_dec(&rps_needed);
- static_branch_dec(&rfs_needed);
- tofree = o_sock_table;
- }
+ for (i = 0; i < size; i++)
+ sock_table[i].ent = RPS_NO_CPU;
+ } else {
+ sock_table = NULL;
+ tag_ptr = 0UL;
+ }
+ if (tag_ptr != o_tag_ptr) {
+ smp_store_release(&net_hotdata.rps_sock_flow_table, tag_ptr);
+ if (sock_table) {
+ static_branch_inc(&rps_needed);
+ static_branch_inc(&rfs_needed);
+ }
+ if (o_sock_table) {
+ static_branch_dec(&rps_needed);
+ static_branch_dec(&rfs_needed);
+ tofree = o_sock_table;
}
}
+unlock:
mutex_unlock(&sock_flow_mutex);
kvfree_rcu_mightsleep(tofree);