diff options
author | Jakub Kicinski <kuba@kernel.org> | 2023-02-08 05:20:02 +0300 |
---|---|---|
committer | Jakub Kicinski <kuba@kernel.org> | 2023-02-08 05:20:03 +0300 |
commit | cc74ca303a658dc4fb69e75df5f79d03d7a9b7e5 (patch) | |
tree | e446248b006239a53bc21ef49dc7c75262be8870 | |
parent | 383d9f87a06dd923c4fd0fdcb65b58258851f545 (diff) | |
parent | 2ac4980c57f54db7c5b416f7946d2921fc16d9d2 (diff) | |
download | linux-cc74ca303a658dc4fb69e75df5f79d03d7a9b7e5.tar.xz |
Merge branch 'sched-cpumask-improve-on-cpumask_local_spread-locality'
Yury Norov says:
====================
sched: cpumask: improve on cpumask_local_spread() locality
cpumask_local_spread() currently checks local node for presence of i'th
CPU, and then if it finds nothing makes a flat search among all non-local
CPUs. We can do it better by checking CPUs per NUMA hops.
This has significant performance implications on NUMA machines, for example
when using NUMA-aware allocated memory together with NUMA-aware IRQ
affinity hints.
Performance tests from patch 8 of this series for mellanox network
driver show:
TCP multi-stream, using 16 iperf3 instances pinned to 16 cores (with aRFS on).
Active cores: 64,65,72,73,80,81,88,89,96,97,104,105,112,113,120,121
+-------------------------+-----------+------------------+------------------+
| | BW (Gbps) | TX side CPU util | RX side CPU util |
+-------------------------+-----------+------------------+------------------+
| Baseline | 52.3 | 6.4 % | 17.9 % |
+-------------------------+-----------+------------------+------------------+
| Applied on TX side only | 52.6 | 5.2 % | 18.5 % |
+-------------------------+-----------+------------------+------------------+
| Applied on RX side only | 94.9 | 11.9 % | 27.2 % |
+-------------------------+-----------+------------------+------------------+
| Applied on both sides | 95.1 | 8.4 % | 27.3 % |
+-------------------------+-----------+------------------+------------------+
Bottleneck in RX side is released, reached linerate (~1.8x speedup).
~30% less cpu util on TX.
====================
Link: https://lore.kernel.org/r/20230121042436.2661843-1-yury.norov@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/eq.c | 18 | ||||
-rw-r--r-- | include/linux/cpumask.h | 20 | ||||
-rw-r--r-- | include/linux/find.h | 33 | ||||
-rw-r--r-- | include/linux/topology.h | 33 | ||||
-rw-r--r-- | kernel/sched/topology.c | 90 | ||||
-rw-r--r-- | lib/cpumask.c | 52 | ||||
-rw-r--r-- | lib/find_bit.c | 9 |
7 files changed, 230 insertions, 25 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 9b44557e7271..66ec7932f008 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -817,9 +817,12 @@ static void comp_irqs_release(struct mlx5_core_dev *dev) static int comp_irqs_request(struct mlx5_core_dev *dev) { struct mlx5_eq_table *table = dev->priv.eq_table; + const struct cpumask *prev = cpu_none_mask; + const struct cpumask *mask; int ncomp_eqs = table->num_comp_eqs; u16 *cpus; int ret; + int cpu; int i; ncomp_eqs = table->num_comp_eqs; @@ -838,8 +841,19 @@ static int comp_irqs_request(struct mlx5_core_dev *dev) ret = -ENOMEM; goto free_irqs; } - for (i = 0; i < ncomp_eqs; i++) - cpus[i] = cpumask_local_spread(i, dev->priv.numa_node); + + i = 0; + rcu_read_lock(); + for_each_numa_hop_mask(mask, dev->priv.numa_node) { + for_each_cpu_andnot(cpu, mask, prev) { + cpus[i] = cpu; + if (++i == ncomp_eqs) + goto spread_done; + } + prev = mask; + } +spread_done: + rcu_read_unlock(); ret = mlx5_irqs_request_vectors(dev, cpus, ncomp_eqs, table->comp_irqs); kfree(cpus); if (ret < 0) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index c2aa0aa26b45..7b16aede7ac5 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -391,6 +391,26 @@ unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1, nr_cpumask_bits, cpumask_check(cpu)); } +/** + * cpumask_nth_and_andnot - get the Nth cpu set in 1st and 2nd cpumask, and clear in 3rd. + * @srcp1: the cpumask pointer + * @srcp2: the cpumask pointer + * @srcp3: the cpumask pointer + * @cpu: the N'th cpu to find, starting from 0 + * + * Returns >= nr_cpu_ids if such cpu doesn't exist. + */ +static __always_inline +unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp1, + const struct cpumask *srcp2, + const struct cpumask *srcp3) +{ + return find_nth_and_andnot_bit(cpumask_bits(srcp1), + cpumask_bits(srcp2), + cpumask_bits(srcp3), + nr_cpumask_bits, cpumask_check(cpu)); +} + #define CPU_BITS_NONE \ { \ [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ diff --git a/include/linux/find.h b/include/linux/find.h index ccaf61a0f5fd..4647864a5ffd 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -22,6 +22,9 @@ unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long unsigned long size, unsigned long n); unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n); +unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, + const unsigned long *addr3, unsigned long size, + unsigned long n); extern unsigned long _find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size); extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); @@ -255,6 +258,36 @@ unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned lon return __find_nth_andnot_bit(addr1, addr2, size, n); } +/** + * find_nth_and_andnot_bit - find N'th set bit in 2 memory regions, + * excluding those set in 3rd region + * @addr1: The 1st address to start the search at + * @addr2: The 2nd address to start the search at + * @addr3: The 3rd address to start the search at + * @size: The maximum number of bits to search + * @n: The number of set bit, which position is needed, counting from 0 + * + * Returns the bit number of the N'th set bit. + * If no such, returns @size. + */ +static __always_inline +unsigned long find_nth_and_andnot_bit(const unsigned long *addr1, + const unsigned long *addr2, + const unsigned long *addr3, + unsigned long size, unsigned long n) +{ + if (n >= size) + return size; + + if (small_const_nbits(size)) { + unsigned long val = *addr1 & *addr2 & (~*addr3) & GENMASK(size - 1, 0); + + return val ? fns(val, n) : size; + } + + return __find_nth_and_andnot_bit(addr1, addr2, addr3, size, n); +} + #ifndef find_first_and_bit /** * find_first_and_bit - find the first set bit in both memory regions diff --git a/include/linux/topology.h b/include/linux/topology.h index 4564faafd0e1..fea32377f7c7 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -245,5 +245,38 @@ static inline const struct cpumask *cpu_cpu_mask(int cpu) return cpumask_of_node(cpu_to_node(cpu)); } +#ifdef CONFIG_NUMA +int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node); +extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops); +#else +static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) +{ + return cpumask_nth(cpu, cpus); +} + +static inline const struct cpumask * +sched_numa_hop_mask(unsigned int node, unsigned int hops) +{ + return ERR_PTR(-EOPNOTSUPP); +} +#endif /* CONFIG_NUMA */ + +/** + * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance + * from a given node. + * @mask: the iteration variable. + * @node: the NUMA node to start the search from. + * + * Requires rcu_lock to be held. + * + * Yields cpu_online_mask for @node == NUMA_NO_NODE. + */ +#define for_each_numa_hop_mask(mask, node) \ + for (unsigned int __hops = 0; \ + mask = (node != NUMA_NO_NODE || __hops) ? \ + sched_numa_hop_mask(node, __hops) : \ + cpu_online_mask, \ + !IS_ERR_OR_NULL(mask); \ + __hops++) #endif /* _LINUX_TOPOLOGY_H */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8739c2a5a54e..1233affc106c 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -3,6 +3,8 @@ * Scheduler topology setup/handling methods */ +#include <linux/bsearch.h> + DEFINE_MUTEX(sched_domains_mutex); /* Protected by sched_domains_mutex: */ @@ -2067,6 +2069,94 @@ unlock: return found; } +struct __cmp_key { + const struct cpumask *cpus; + struct cpumask ***masks; + int node; + int cpu; + int w; +}; + +static int hop_cmp(const void *a, const void *b) +{ + struct cpumask **prev_hop = *((struct cpumask ***)b - 1); + struct cpumask **cur_hop = *(struct cpumask ***)b; + struct __cmp_key *k = (struct __cmp_key *)a; + + if (cpumask_weight_and(k->cpus, cur_hop[k->node]) <= k->cpu) + return 1; + + k->w = (b == k->masks) ? 0 : cpumask_weight_and(k->cpus, prev_hop[k->node]); + if (k->w <= k->cpu) + return 0; + + return -1; +} + +/* + * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth next cpu + * closest to @cpu from @cpumask. + * cpumask: cpumask to find a cpu from + * cpu: Nth cpu to find + * + * returns: cpu, or nr_cpu_ids when nothing found. + */ +int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) +{ + struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu }; + struct cpumask ***hop_masks; + int hop, ret = nr_cpu_ids; + + rcu_read_lock(); + + k.masks = rcu_dereference(sched_domains_numa_masks); + if (!k.masks) + goto unlock; + + hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp); + hop = hop_masks - k.masks; + + ret = hop ? + cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) : + cpumask_nth_and(cpu, cpus, k.masks[0][node]); +unlock: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu); + +/** + * sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from + * @node + * @node: The node to count hops from. + * @hops: Include CPUs up to that many hops away. 0 means local node. + * + * Return: On success, a pointer to a cpumask of CPUs at most @hops away from + * @node, an error value otherwise. + * + * Requires rcu_lock to be held. Returned cpumask is only valid within that + * read-side section, copy it if required beyond that. + * + * Note that not all hops are equal in distance; see sched_init_numa() for how + * distances and masks are handled. + * Also note that this is a reflection of sched_domains_numa_masks, which may change + * during the lifetime of the system (offline nodes are taken out of the masks). + */ +const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops) +{ + struct cpumask ***masks; + + if (node >= nr_node_ids || hops >= sched_domains_numa_levels) + return ERR_PTR(-EINVAL); + + masks = rcu_dereference(sched_domains_numa_masks); + if (!masks) + return ERR_PTR(-EBUSY); + + return masks[hops][node]; +} +EXPORT_SYMBOL_GPL(sched_numa_hop_mask); + #endif /* CONFIG_NUMA */ static int __sdt_alloc(const struct cpumask *cpu_map) diff --git a/lib/cpumask.c b/lib/cpumask.c index c7c392514fd3..e7258836b60b 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -110,15 +110,33 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask) #endif /** - * cpumask_local_spread - select the i'th cpu with local numa cpu's first + * cpumask_local_spread - select the i'th cpu based on NUMA distances * @i: index number * @node: local numa_node * - * This function selects an online CPU according to a numa aware policy; - * local cpus are returned first, followed by non-local ones, then it - * wraps around. + * Returns online CPU according to a numa aware policy; local cpus are returned + * first, followed by non-local ones, then it wraps around. * - * It's not very efficient, but useful for setup. + * For those who wants to enumerate all CPUs based on their NUMA distances, + * i.e. call this function in a loop, like: + * + * for (i = 0; i < num_online_cpus(); i++) { + * cpu = cpumask_local_spread(i, node); + * do_something(cpu); + * } + * + * There's a better alternative based on for_each()-like iterators: + * + * for_each_numa_hop_mask(mask, node) { + * for_each_cpu_andnot(cpu, mask, prev) + * do_something(cpu); + * prev = mask; + * } + * + * It's simpler and more verbose than above. Complexity of iterator-based + * enumeration is O(sched_domains_numa_levels * nr_cpu_ids), while + * cpumask_local_spread() when called for each cpu is + * O(sched_domains_numa_levels * nr_cpu_ids * log(nr_cpu_ids)). */ unsigned int cpumask_local_spread(unsigned int i, int node) { @@ -127,24 +145,12 @@ unsigned int cpumask_local_spread(unsigned int i, int node) /* Wrap: we always want a cpu. */ i %= num_online_cpus(); - if (node == NUMA_NO_NODE) { - cpu = cpumask_nth(i, cpu_online_mask); - if (cpu < nr_cpu_ids) - return cpu; - } else { - /* NUMA first. */ - cpu = cpumask_nth_and(i, cpu_online_mask, cpumask_of_node(node)); - if (cpu < nr_cpu_ids) - return cpu; - - i -= cpumask_weight_and(cpu_online_mask, cpumask_of_node(node)); - - /* Skip NUMA nodes, done above. */ - cpu = cpumask_nth_andnot(i, cpu_online_mask, cpumask_of_node(node)); - if (cpu < nr_cpu_ids) - return cpu; - } - BUG(); + cpu = (node == NUMA_NO_NODE) ? + cpumask_nth(i, cpu_online_mask) : + sched_numa_find_nth_cpu(cpu_online_mask, i, node); + + WARN_ON(cpu >= nr_cpu_ids); + return cpu; } EXPORT_SYMBOL(cpumask_local_spread); diff --git a/lib/find_bit.c b/lib/find_bit.c index 18bc0a7ac8ee..c10920e66788 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -155,6 +155,15 @@ unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned l } EXPORT_SYMBOL(__find_nth_andnot_bit); +unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, + const unsigned long *addr2, + const unsigned long *addr3, + unsigned long size, unsigned long n) +{ + return FIND_NTH_BIT(addr1[idx] & addr2[idx] & ~addr3[idx], size, n); +} +EXPORT_SYMBOL(__find_nth_and_andnot_bit); + #ifndef find_next_and_bit unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start) |