From 80367ad01d93ac781b0e1df246edaf006928002f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 16 Apr 2025 18:29:12 +0200 Subject: futex: Add basic infrastructure for local task local hash The futex hash is system wide and shared by all tasks. Each slot is hashed based on futex address and the VMA of the thread. Due to randomized VMAs (and memory allocations) the same logical lock (pointer) can end up in a different hash bucket on each invocation of the application. This in turn means that different applications may share a hash bucket on the first invocation but not on the second and it is not always clear which applications will be involved. This can result in high latency's to acquire the futex_hash_bucket::lock especially if the lock owner is limited to a CPU and can not be effectively PI boosted. Introduce basic infrastructure for process local hash which is shared by all threads of process. This hash will only be used for a PROCESS_PRIVATE FUTEX operation. The hashmap can be allocated via: prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, num); A `num' of 0 means that the global hash is used instead of a private hash. Other values for `num' specify the number of slots for the hash and the number must be power of two, starting with two. The prctl() returns zero on success. This function can only be used before a thread is created. The current status for the private hash can be queried via: num = prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS); which return the current number of slots. The value 0 means that the global hash is used. Values greater than 0 indicate the number of slots that are used. A negative number indicates an error. For optimisation, for the private hash jhash2() uses only two arguments the address and the offset. This omits the VMA which is always the same. [peterz: Use 0 for global hash. A bit shuffling and renaming. ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250416162921.513656-13-bigeasy@linutronix.de --- include/uapi/linux/prctl.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 15c18ef4eb11..3b93fb906e3c 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -364,4 +364,9 @@ struct prctl_mm_map { # define PR_TIMER_CREATE_RESTORE_IDS_ON 1 # define PR_TIMER_CREATE_RESTORE_IDS_GET 2 +/* FUTEX hash management */ +#define PR_FUTEX_HASH 78 +# define PR_FUTEX_HASH_SET_SLOTS 1 +# define PR_FUTEX_HASH_GET_SLOTS 2 + #endif /* _LINUX_PRCTL_H */ -- cgit v1.2.3 From 63e8595c060a1fef421e3eecfc05ad882dafb8ac Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 16 Apr 2025 18:29:15 +0200 Subject: futex: Allow to make the private hash immutable My initial testing showed that: perf bench futex hash reported less operations/sec with private hash. After using the same amount of buckets in the private hash as used by the global hash then the operations/sec were about the same. This changed once the private hash became resizable. This feature added an RCU section and reference counting via atomic inc+dec operation into the hot path. The reference counting can be avoided if the private hash is made immutable. Extend PR_FUTEX_HASH_SET_SLOTS by a fourth argument which denotes if the private should be made immutable. Once set (to true) the a further resize is not allowed (same if set to global hash). Add PR_FUTEX_HASH_GET_IMMUTABLE which returns true if the hash can not be changed. Update "perf bench" suite. For comparison, results of "perf bench futex hash -s": - Xeon CPU E5-2650, 2 NUMA nodes, total 32 CPUs: - Before the introducing task local hash shared Averaged 1.487.148 operations/sec (+- 0,53%), total secs = 10 private Averaged 2.192.405 operations/sec (+- 0,07%), total secs = 10 - With the series shared Averaged 1.326.342 operations/sec (+- 0,41%), total secs = 10 -b128 Averaged 141.394 operations/sec (+- 1,15%), total secs = 10 -Ib128 Averaged 851.490 operations/sec (+- 0,67%), total secs = 10 -b8192 Averaged 131.321 operations/sec (+- 2,13%), total secs = 10 -Ib8192 Averaged 1.923.077 operations/sec (+- 0,61%), total secs = 10 128 is the default allocation of hash buckets. 8192 was the previous amount of allocated hash buckets. - Xeon(R) CPU E7-8890 v3, 4 NUMA nodes, total 144 CPUs: - Before the introducing task local hash shared Averaged 1.810.936 operations/sec (+- 0,26%), total secs = 20 private Averaged 2.505.801 operations/sec (+- 0,05%), total secs = 20 - With the series shared Averaged 1.589.002 operations/sec (+- 0,25%), total secs = 20 -b1024 Averaged 42.410 operations/sec (+- 0,20%), total secs = 20 -Ib1024 Averaged 740.638 operations/sec (+- 1,51%), total secs = 20 -b65536 Averaged 48.811 operations/sec (+- 1,35%), total secs = 20 -Ib65536 Averaged 1.963.165 operations/sec (+- 0,18%), total secs = 20 1024 is the default allocation of hash buckets. 65536 was the previous amount of allocated hash buckets. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Acked-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20250416162921.513656-16-bigeasy@linutronix.de --- include/uapi/linux/prctl.h | 2 ++ kernel/futex/core.c | 49 ++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 45 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 3b93fb906e3c..43dec6eed559 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -367,6 +367,8 @@ struct prctl_mm_map { /* FUTEX hash management */ #define PR_FUTEX_HASH 78 # define PR_FUTEX_HASH_SET_SLOTS 1 +# define FH_FLAG_IMMUTABLE (1ULL << 0) # define PR_FUTEX_HASH_GET_SLOTS 2 +# define PR_FUTEX_HASH_GET_IMMUTABLE 3 #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 9e7dad52abea..8054fda94719 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -63,6 +63,7 @@ struct futex_private_hash { struct rcu_head rcu; void *mm; bool custom; + bool immutable; struct futex_hash_bucket queues[]; }; @@ -132,12 +133,16 @@ static inline bool futex_key_is_private(union futex_key *key) bool futex_private_hash_get(struct futex_private_hash *fph) { + if (fph->immutable) + return true; return rcuref_get(&fph->users); } void futex_private_hash_put(struct futex_private_hash *fph) { /* Ignore return value, last put is verified via rcuref_is_dead() */ + if (fph->immutable) + return; if (rcuref_put(&fph->users)) wake_up_var(fph->mm); } @@ -277,6 +282,8 @@ again: if (!fph) return NULL; + if (fph->immutable) + return fph; if (rcuref_get(&fph->users)) return fph; } @@ -1383,6 +1390,9 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb, spin_lock_init(&fhb->lock); } +#define FH_CUSTOM 0x01 +#define FH_IMMUTABLE 0x02 + #ifdef CONFIG_FUTEX_PRIVATE_HASH void futex_hash_free(struct mm_struct *mm) { @@ -1433,10 +1443,11 @@ static bool futex_hash_less(struct futex_private_hash *a, return false; /* equal */ } -static int futex_hash_allocate(unsigned int hash_slots, bool custom) +static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) { struct mm_struct *mm = current->mm; struct futex_private_hash *fph; + bool custom = flags & FH_CUSTOM; int i; if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots))) @@ -1447,7 +1458,7 @@ static int futex_hash_allocate(unsigned int hash_slots, bool custom) */ scoped_guard(rcu) { fph = rcu_dereference(mm->futex_phash); - if (fph && !fph->hash_mask) { + if (fph && (!fph->hash_mask || fph->immutable)) { if (custom) return -EBUSY; return 0; @@ -1461,6 +1472,7 @@ static int futex_hash_allocate(unsigned int hash_slots, bool custom) rcuref_init(&fph->users, 1); fph->hash_mask = hash_slots ? hash_slots - 1 : 0; fph->custom = custom; + fph->immutable = !!(flags & FH_IMMUTABLE); fph->mm = mm; for (i = 0; i < hash_slots; i++) @@ -1553,7 +1565,7 @@ int futex_hash_allocate_default(void) if (current_buckets >= buckets) return 0; - return futex_hash_allocate(buckets, false); + return futex_hash_allocate(buckets, 0); } static int futex_hash_get_slots(void) @@ -1567,9 +1579,22 @@ static int futex_hash_get_slots(void) return 0; } +static int futex_hash_get_immutable(void) +{ + struct futex_private_hash *fph; + + guard(rcu)(); + fph = rcu_dereference(current->mm->futex_phash); + if (fph && fph->immutable) + return 1; + if (fph && !fph->hash_mask) + return 1; + return 0; +} + #else -static int futex_hash_allocate(unsigned int hash_slots, bool custom) +static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) { return -EINVAL; } @@ -1578,23 +1603,35 @@ static int futex_hash_get_slots(void) { return 0; } + +static int futex_hash_get_immutable(void) +{ + return 0; +} #endif int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) { + unsigned int flags = FH_CUSTOM; int ret; switch (arg2) { case PR_FUTEX_HASH_SET_SLOTS: - if (arg4 != 0) + if (arg4 & ~FH_FLAG_IMMUTABLE) return -EINVAL; - ret = futex_hash_allocate(arg3, true); + if (arg4 & FH_FLAG_IMMUTABLE) + flags |= FH_IMMUTABLE; + ret = futex_hash_allocate(arg3, flags); break; case PR_FUTEX_HASH_GET_SLOTS: ret = futex_hash_get_slots(); break; + case PR_FUTEX_HASH_GET_IMMUTABLE: + ret = futex_hash_get_immutable(); + break; + default: ret = -EINVAL; break; -- cgit v1.2.3 From cec199c5e39bde7191a08087cc3d002ccfab31ff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Apr 2025 18:29:16 +0200 Subject: futex: Implement FUTEX2_NUMA Extend the futex2 interface to be numa aware. When FUTEX2_NUMA is specified for a futex, the user value is extended to two words (of the same size). The first is the user value we all know, the second one will be the node to place this futex on. struct futex_numa_32 { u32 val; u32 node; }; When node is set to ~0, WAIT will set it to the current node_id such that WAKE knows where to find it. If userspace corrupts the node value between WAIT and WAKE, the futex will not be found and no wakeup will happen. When FUTEX2_NUMA is not set, the node is simply an extension of the hash, such that traditional futexes are still interleaved over the nodes. This is done to avoid having to have a separate !numa hash-table. [bigeasy: ensure to have at least hashsize of 4 in futex_init(), add pr_info() for size and allocation information. Cast the naddr math to void*] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250416162921.513656-17-bigeasy@linutronix.de --- include/linux/futex.h | 3 ++ include/uapi/linux/futex.h | 7 ++++ kernel/futex/core.c | 100 +++++++++++++++++++++++++++++++++++++-------- kernel/futex/futex.h | 33 +++++++++++++-- 4 files changed, 123 insertions(+), 20 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/futex.h b/include/linux/futex.h index 40bc778b2bb4..eccc99751bd9 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -34,6 +34,7 @@ union futex_key { u64 i_seq; unsigned long pgoff; unsigned int offset; + /* unsigned int node; */ } shared; struct { union { @@ -42,11 +43,13 @@ union futex_key { }; unsigned long address; unsigned int offset; + /* unsigned int node; */ } private; struct { u64 ptr; unsigned long word; unsigned int offset; + unsigned int node; /* NOT hashed! */ } both; }; diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h index d2ee625ea189..6b94da467e70 100644 --- a/include/uapi/linux/futex.h +++ b/include/uapi/linux/futex.h @@ -74,6 +74,13 @@ /* do not use */ #define FUTEX_32 FUTEX2_SIZE_U32 /* historical accident :-( */ +/* + * When FUTEX2_NUMA doubles the futex word, the second word is a node value. + * The special value -1 indicates no-node. This is the same value as + * NUMA_NO_NODE, except that value is not ABI, this is. + */ +#define FUTEX_NO_NODE (-1) + /* * Max numbers of elements in a futex_waitv array */ diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 8054fda94719..1490e6492993 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -36,6 +36,8 @@ #include #include #include +#include +#include #include #include #include @@ -51,11 +53,14 @@ * reside in the same cacheline. */ static struct { - struct futex_hash_bucket *queues; unsigned long hashmask; + unsigned int hashshift; + struct futex_hash_bucket *queues[MAX_NUMNODES]; } __futex_data __read_mostly __aligned(2*sizeof(long)); -#define futex_queues (__futex_data.queues) -#define futex_hashmask (__futex_data.hashmask) + +#define futex_hashmask (__futex_data.hashmask) +#define futex_hashshift (__futex_data.hashshift) +#define futex_queues (__futex_data.queues) struct futex_private_hash { rcuref_t users; @@ -339,15 +344,35 @@ __futex_hash(union futex_key *key, struct futex_private_hash *fph) { struct futex_hash_bucket *hb; u32 hash; + int node; hb = __futex_hash_private(key, fph); if (hb) return hb; hash = jhash2((u32 *)key, - offsetof(typeof(*key), both.offset) / 4, + offsetof(typeof(*key), both.offset) / sizeof(u32), key->both.offset); - return &futex_queues[hash & futex_hashmask]; + node = key->both.node; + + if (node == FUTEX_NO_NODE) { + /* + * In case of !FLAGS_NUMA, use some unused hash bits to pick a + * node -- this ensures regular futexes are interleaved across + * the nodes and avoids having to allocate multiple + * hash-tables. + * + * NOTE: this isn't perfectly uniform, but it is fast and + * handles sparse node masks. + */ + node = (hash >> futex_hashshift) % nr_node_ids; + if (!node_possible(node)) { + node = find_next_bit_wrap(node_possible_map.bits, + nr_node_ids, node); + } + } + + return &futex_queues[node][hash & futex_hashmask]; } /** @@ -454,25 +479,49 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, struct page *page; struct folio *folio; struct address_space *mapping; - int err, ro = 0; + int node, err, size, ro = 0; bool fshared; fshared = flags & FLAGS_SHARED; + size = futex_size(flags); + if (flags & FLAGS_NUMA) + size *= 2; /* * The futex address must be "naturally" aligned. */ key->both.offset = address % PAGE_SIZE; - if (unlikely((address % sizeof(u32)) != 0)) + if (unlikely((address % size) != 0)) return -EINVAL; address -= key->both.offset; - if (unlikely(!access_ok(uaddr, sizeof(u32)))) + if (unlikely(!access_ok(uaddr, size))) return -EFAULT; if (unlikely(should_fail_futex(fshared))) return -EFAULT; + if (flags & FLAGS_NUMA) { + u32 __user *naddr = (void *)uaddr + size / 2; + + if (futex_get_value(&node, naddr)) + return -EFAULT; + + if (node == FUTEX_NO_NODE) { + node = numa_node_id(); + if (futex_put_value(node, naddr)) + return -EFAULT; + + } else if (node >= MAX_NUMNODES || !node_possible(node)) { + return -EINVAL; + } + + key->both.node = node; + + } else { + key->both.node = FUTEX_NO_NODE; + } + /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs @@ -1642,24 +1691,41 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) static int __init futex_init(void) { unsigned long hashsize, i; - unsigned int futex_shift; + unsigned int order, n; + unsigned long size; #ifdef CONFIG_BASE_SMALL hashsize = 16; #else - hashsize = roundup_pow_of_two(256 * num_possible_cpus()); + hashsize = 256 * num_possible_cpus(); + hashsize /= num_possible_nodes(); + hashsize = max(4, hashsize); + hashsize = roundup_pow_of_two(hashsize); #endif + futex_hashshift = ilog2(hashsize); + size = sizeof(struct futex_hash_bucket) * hashsize; + order = get_order(size); - futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), - hashsize, 0, 0, - &futex_shift, NULL, - hashsize, hashsize); - hashsize = 1UL << futex_shift; + for_each_node(n) { + struct futex_hash_bucket *table; - for (i = 0; i < hashsize; i++) - futex_hash_bucket_init(&futex_queues[i], NULL); + if (order > MAX_PAGE_ORDER) + table = vmalloc_huge_node(size, GFP_KERNEL, n); + else + table = alloc_pages_exact_nid(n, size, GFP_KERNEL); + + BUG_ON(!table); + + for (i = 0; i < hashsize; i++) + futex_hash_bucket_init(&table[i], NULL); + + futex_queues[n] = table; + } futex_hashmask = hashsize - 1; + pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n", + hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024, + order > MAX_PAGE_ORDER ? "vmalloc" : "linear"); return 0; } core_initcall(futex_init); diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 899aed5acde1..acc795367889 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -54,7 +54,7 @@ static inline unsigned int futex_to_flags(unsigned int op) return flags; } -#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE) +#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_PRIVATE) /* FUTEX2_ to FLAGS_ */ static inline unsigned int futex2_to_flags(unsigned int flags2) @@ -87,6 +87,19 @@ static inline bool futex_flags_valid(unsigned int flags) if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32) return false; + /* + * Must be able to represent both FUTEX_NO_NODE and every valid nodeid + * in a futex word. + */ + if (flags & FLAGS_NUMA) { + int bits = 8 * futex_size(flags); + u64 max = ~0ULL; + + max >>= 64 - bits; + if (nr_node_ids >= max) + return false; + } + return true; } @@ -282,7 +295,7 @@ static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 * This looks a bit overkill, but generally just results in a couple * of instructions. */ -static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from) +static __always_inline int futex_get_value(u32 *dest, u32 __user *from) { u32 val; @@ -299,12 +312,26 @@ Efault: return -EFAULT; } +static __always_inline int futex_put_value(u32 val, u32 __user *to) +{ + if (can_do_masked_user_access()) + to = masked_user_access_begin(to); + else if (!user_read_access_begin(to, sizeof(*to))) + return -EFAULT; + unsafe_put_user(val, to, Efault); + user_read_access_end(); + return 0; +Efault: + user_read_access_end(); + return -EFAULT; +} + static inline int futex_get_value_locked(u32 *dest, u32 __user *from) { int ret; pagefault_disable(); - ret = futex_read_inatomic(dest, from); + ret = futex_get_value(dest, from); pagefault_enable(); return ret; -- cgit v1.2.3 From c042c505210dc3453f378df432c10fff3d471bc5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Apr 2025 18:29:17 +0200 Subject: futex: Implement FUTEX2_MPOL Extend the futex2 interface to be aware of mempolicy. When FUTEX2_MPOL is specified and there is a MPOL_PREFERRED or home_node specified covering the futex address, use that hash-map. Notably, in this case the futex will go to the global node hashtable, even if it is a PRIVATE futex. When FUTEX2_NUMA|FUTEX2_MPOL is specified and the user specified node value is FUTEX_NO_NODE, the MPOL lookup (as described above) will be tried first before reverting to setting node to the local node. [bigeasy: add CONFIG_FUTEX_MPOL, add MPOL to FUTEX2_VALID_MASK, write the node only to user if FUTEX_NO_NODE was supplied] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250416162921.513656-18-bigeasy@linutronix.de --- include/linux/mmap_lock.h | 4 ++ include/uapi/linux/futex.h | 2 +- init/Kconfig | 5 ++ kernel/futex/core.c | 116 ++++++++++++++++++++++++++++++++++++++------- kernel/futex/futex.h | 6 ++- 5 files changed, 115 insertions(+), 18 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 4706c6769902..e0eddfd306ef 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -7,6 +7,7 @@ #include #include #include +#include #define MMAP_LOCK_INITIALIZER(name) \ .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), @@ -211,6 +212,9 @@ static inline void mmap_read_unlock(struct mm_struct *mm) up_read(&mm->mmap_lock); } +DEFINE_GUARD(mmap_read_lock, struct mm_struct *, + mmap_read_lock(_T), mmap_read_unlock(_T)) + static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) { __mmap_lock_trace_released(mm, false); diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h index 6b94da467e70..7e2744ec8933 100644 --- a/include/uapi/linux/futex.h +++ b/include/uapi/linux/futex.h @@ -63,7 +63,7 @@ #define FUTEX2_SIZE_U32 0x02 #define FUTEX2_SIZE_U64 0x03 #define FUTEX2_NUMA 0x04 - /* 0x08 */ +#define FUTEX2_MPOL 0x08 /* 0x10 */ /* 0x20 */ /* 0x40 */ diff --git a/init/Kconfig b/init/Kconfig index 4b84da2b2ec4..b373267ba2e2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1704,6 +1704,11 @@ config FUTEX_PRIVATE_HASH depends on FUTEX && !BASE_SMALL && MMU default y +config FUTEX_MPOL + bool + depends on FUTEX && NUMA + default y + config EPOLL bool "Enable eventpoll support" if EXPERT default y diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 1490e6492993..19a2c65f3d37 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -43,6 +43,8 @@ #include #include #include +#include +#include #include "futex.h" #include "../locking/rtmutex_common.h" @@ -328,6 +330,75 @@ struct futex_hash_bucket *futex_hash(union futex_key *key) #endif /* CONFIG_FUTEX_PRIVATE_HASH */ +#ifdef CONFIG_FUTEX_MPOL + +static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma = vma_lookup(mm, addr); + struct mempolicy *mpol; + int node = FUTEX_NO_NODE; + + if (!vma) + return FUTEX_NO_NODE; + + mpol = vma_policy(vma); + if (!mpol) + return FUTEX_NO_NODE; + + switch (mpol->mode) { + case MPOL_PREFERRED: + node = first_node(mpol->nodes); + break; + case MPOL_PREFERRED_MANY: + case MPOL_BIND: + if (mpol->home_node != NUMA_NO_NODE) + node = mpol->home_node; + break; + default: + break; + } + + return node; +} + +static int futex_key_to_node_opt(struct mm_struct *mm, unsigned long addr) +{ + int seq, node; + + guard(rcu)(); + + if (!mmap_lock_speculate_try_begin(mm, &seq)) + return -EBUSY; + + node = __futex_key_to_node(mm, addr); + + if (mmap_lock_speculate_retry(mm, seq)) + return -EAGAIN; + + return node; +} + +static int futex_mpol(struct mm_struct *mm, unsigned long addr) +{ + int node; + + node = futex_key_to_node_opt(mm, addr); + if (node >= FUTEX_NO_NODE) + return node; + + guard(mmap_read_lock)(mm); + return __futex_key_to_node(mm, addr); +} + +#else /* !CONFIG_FUTEX_MPOL */ + +static int futex_mpol(struct mm_struct *mm, unsigned long addr) +{ + return FUTEX_NO_NODE; +} + +#endif /* CONFIG_FUTEX_MPOL */ + /** * __futex_hash - Return the hash bucket * @key: Pointer to the futex key for which the hash is calculated @@ -342,18 +413,20 @@ struct futex_hash_bucket *futex_hash(union futex_key *key) static struct futex_hash_bucket * __futex_hash(union futex_key *key, struct futex_private_hash *fph) { - struct futex_hash_bucket *hb; + int node = key->both.node; u32 hash; - int node; - hb = __futex_hash_private(key, fph); - if (hb) - return hb; + if (node == FUTEX_NO_NODE) { + struct futex_hash_bucket *hb; + + hb = __futex_hash_private(key, fph); + if (hb) + return hb; + } hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / sizeof(u32), key->both.offset); - node = key->both.node; if (node == FUTEX_NO_NODE) { /* @@ -480,6 +553,7 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, struct folio *folio; struct address_space *mapping; int node, err, size, ro = 0; + bool node_updated = false; bool fshared; fshared = flags & FLAGS_SHARED; @@ -501,27 +575,37 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, if (unlikely(should_fail_futex(fshared))) return -EFAULT; + node = FUTEX_NO_NODE; + if (flags & FLAGS_NUMA) { u32 __user *naddr = (void *)uaddr + size / 2; if (futex_get_value(&node, naddr)) return -EFAULT; - if (node == FUTEX_NO_NODE) { - node = numa_node_id(); - if (futex_put_value(node, naddr)) - return -EFAULT; - - } else if (node >= MAX_NUMNODES || !node_possible(node)) { + if (node != FUTEX_NO_NODE && + (node >= MAX_NUMNODES || !node_possible(node))) return -EINVAL; - } + } - key->both.node = node; + if (node == FUTEX_NO_NODE && (flags & FLAGS_MPOL)) { + node = futex_mpol(mm, address); + node_updated = true; + } - } else { - key->both.node = FUTEX_NO_NODE; + if (flags & FLAGS_NUMA) { + u32 __user *naddr = (void *)uaddr + size / 2; + + if (node == FUTEX_NO_NODE) { + node = numa_node_id(); + node_updated = true; + } + if (node_updated && futex_put_value(node, naddr)) + return -EFAULT; } + key->both.node = node; + /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index acc795367889..069fc2a83080 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -39,6 +39,7 @@ #define FLAGS_HAS_TIMEOUT 0x0040 #define FLAGS_NUMA 0x0080 #define FLAGS_STRICT 0x0100 +#define FLAGS_MPOL 0x0200 /* FUTEX_ to FLAGS_ */ static inline unsigned int futex_to_flags(unsigned int op) @@ -54,7 +55,7 @@ static inline unsigned int futex_to_flags(unsigned int op) return flags; } -#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_PRIVATE) +#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_MPOL | FUTEX2_PRIVATE) /* FUTEX2_ to FLAGS_ */ static inline unsigned int futex2_to_flags(unsigned int flags2) @@ -67,6 +68,9 @@ static inline unsigned int futex2_to_flags(unsigned int flags2) if (flags2 & FUTEX2_NUMA) flags |= FLAGS_NUMA; + if (flags2 & FUTEX2_MPOL) + flags |= FLAGS_MPOL; + return flags; } -- cgit v1.2.3