summary refs log tree commit diff
path: root/net
diff options
context:
space:
mode:
authorEric Dumazet <edumazet@google.com>2026-03-02 21:14:29 +0300
committerJakub Kicinski <kuba@kernel.org>2026-03-05 03:54:09 +0300
commitdd378109d20ff6789091fa3558607c1d242d80ad (patch)
tree45595854374fd90b3474ed54673e5b18b1683672 /net
parent9cde131cdd888873363b5d9dfd8d4d4c1fae6986 (diff)
downloadlinux-dd378109d20ff6789091fa3558607c1d242d80ad.tar.xz
net-sysfs: use rps_tag_ptr and remove metadata from rps_sock_flow_table
Instead of storing the @mask at the beginning of rps_sock_flow_table, use the 5 low-order bits of the rps_tag_ptr to store the log2 of the table size. This removes a potential cache-line miss to fetch @mask. More importantly, we can switch to vmalloc_huge() without wasting memory.

Tested with:

numactl --interleave=all bash -c "echo 4194304 >/proc/sys/net/core/rps_sock_flow_entries"

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20260302181432.1836150-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net')
-rw-r--r--net/core/dev.c12
-rw-r--r--net/core/sysctl_net_core.c89
2 files changed, 54 insertions, 47 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 92f8eeac8de3..7ae87be81afc 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5075,9 +5075,9 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow **rflowp)
{
- const struct rps_sock_flow_table *sock_flow_table;
struct netdev_rx_queue *rxqueue = dev->_rx;
struct rps_dev_flow_table *flow_table;
+ rps_tag_ptr global_tag_ptr;
struct rps_map *map;
int cpu = -1;
u32 tcpu;
@@ -5108,8 +5108,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
if (!hash)
goto done;
- sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
- if (flow_table && sock_flow_table) {
+ global_tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table);
+ if (flow_table && global_tag_ptr) {
+ struct rps_sock_flow_table *sock_flow_table;
struct rps_dev_flow *rflow;
u32 next_cpu;
u32 flow_id;
@@ -5118,8 +5119,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
/* First check into global flow table if there is a match.
* This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
*/
- flow_id = hash & rps_sock_flow_table_mask(sock_flow_table);
- ident = READ_ONCE(sock_flow_table->ents[flow_id]);
+ flow_id = hash & rps_tag_to_mask(global_tag_ptr);
+ sock_flow_table = rps_tag_to_table(global_tag_ptr);
+ ident = READ_ONCE(sock_flow_table[flow_id].ent);
if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask)
goto try_rps;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index cfbe798493b5..502705e04649 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -138,68 +138,73 @@ done:
static int rps_sock_flow_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
+ struct rps_sock_flow_table *o_sock_table, *sock_table;
+ static DEFINE_MUTEX(sock_flow_mutex);
+ rps_tag_ptr o_tag_ptr, tag_ptr;
unsigned int orig_size, size;
- int ret, i;
struct ctl_table tmp = {
.data = &size,
.maxlen = sizeof(size),
.mode = table->mode
};
- struct rps_sock_flow_table *o_sock_table, *sock_table;
- static DEFINE_MUTEX(sock_flow_mutex);
void *tofree = NULL;
+ int ret, i;
+ u8 log;
mutex_lock(&sock_flow_mutex);
- o_sock_table = rcu_dereference_protected(
- net_hotdata.rps_sock_flow_table,
- lockdep_is_held(&sock_flow_mutex));
- size = o_sock_table ? rps_sock_flow_table_mask(o_sock_table) + 1 : 0;
+ o_tag_ptr = tag_ptr = net_hotdata.rps_sock_flow_table;
+
+ size = o_tag_ptr ? rps_tag_to_mask(o_tag_ptr) + 1 : 0;
+ o_sock_table = rps_tag_to_table(o_tag_ptr);
orig_size = size;
ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
- if (write) {
- if (size) {
- if (size > 1<<29) {
- /* Enforce limit to prevent overflow */
+ if (!write)
+ goto unlock;
+
+ if (size) {
+ if (size > 1<<29) {
+ /* Enforce limit to prevent overflow */
+ mutex_unlock(&sock_flow_mutex);
+ return -EINVAL;
+ }
+ sock_table = o_sock_table;
+ size = roundup_pow_of_two(size);
+ if (size != orig_size) {
+ sock_table = vmalloc_huge(size * sizeof(*sock_table),
+ GFP_KERNEL);
+ if (!sock_table) {
mutex_unlock(&sock_flow_mutex);
- return -EINVAL;
- }
- sock_table = o_sock_table;
- size = roundup_pow_of_two(size);
- if (size != orig_size) {
- sock_table =
- vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
- if (!sock_table) {
- mutex_unlock(&sock_flow_mutex);
- return -ENOMEM;
- }
- net_hotdata.rps_cpu_mask =
- roundup_pow_of_two(nr_cpu_ids) - 1;
- sock_table->_mask = size - 1;
+ return -ENOMEM;
}
+ net_hotdata.rps_cpu_mask =
+ roundup_pow_of_two(nr_cpu_ids) - 1;
+ log = ilog2(size);
+ tag_ptr = (rps_tag_ptr)sock_table | log;
+ }
- for (i = 0; i < size; i++)
- sock_table->ents[i] = RPS_NO_CPU;
- } else
- sock_table = NULL;
-
- if (sock_table != o_sock_table) {
- rcu_assign_pointer(net_hotdata.rps_sock_flow_table,
- sock_table);
- if (sock_table) {
- static_branch_inc(&rps_needed);
- static_branch_inc(&rfs_needed);
- }
- if (o_sock_table) {
- static_branch_dec(&rps_needed);
- static_branch_dec(&rfs_needed);
- tofree = o_sock_table;
- }
+ for (i = 0; i < size; i++)
+ sock_table[i].ent = RPS_NO_CPU;
+ } else {
+ sock_table = NULL;
+ tag_ptr = 0UL;
+ }
+ if (tag_ptr != o_tag_ptr) {
+ smp_store_release(&net_hotdata.rps_sock_flow_table, tag_ptr);
+ if (sock_table) {
+ static_branch_inc(&rps_needed);
+ static_branch_inc(&rfs_needed);
+ }
+ if (o_sock_table) {
+ static_branch_dec(&rps_needed);
+ static_branch_dec(&rfs_needed);
+ tofree = o_sock_table;
}
}
+unlock:
mutex_unlock(&sock_flow_mutex);
kvfree_rcu_mightsleep(tofree);