summaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
authorSuren Baghdasaryan <surenb@google.com>2025-02-14 01:46:49 +0300
committerAndrew Morton <akpm@linux-foundation.org>2025-03-17 08:06:20 +0300
commitf35ab95ca0af7a27feab57b9d7e906405bddb093 (patch)
treeedd5498aae7dadfe230f2488704c4be05f173a08 /include/linux
parent4e0dbe105d5088c77eb09de6f049aaf44711a2ec (diff)
downloadlinux-f35ab95ca0af7a27feab57b9d7e906405bddb093.tar.xz
mm: replace vm_lock and detached flag with a reference count
rw_semaphore is a sizable structure of 40 bytes and consumes considerable space for each vm_area_struct. However vma_lock has two important specifics which can be used to replace rw_semaphore with a simpler structure: 1. Readers never wait. They try to take the vma_lock and fall back to mmap_lock if that fails. 2. Only one writer at a time will ever try to write-lock a vma_lock because writers first take mmap_lock in write mode. Because of these requirements, full rw_semaphore functionality is not needed and we can replace rw_semaphore and the vma->detached flag with a refcount (vm_refcnt). When vma is in detached state, vm_refcnt is 0 and only a call to vma_mark_attached() can take it out of this state. Note that unlike before, now we enforce both vma_mark_attached() and vma_mark_detached() to be done only after vma has been write-locked. vma_mark_attached() changes vm_refcnt to 1 to indicate that it has been attached to the vma tree. When a reader takes read lock, it increments vm_refcnt, unless the top usable bit of vm_refcnt (0x40000000) is set, indicating presence of a writer. When writer takes write lock, it sets the top usable bit to indicate its presence. If there are readers, writer will wait using newly introduced mm->vma_writer_wait. Since all writers take mmap_lock in write mode first, there can be only one writer at a time. The last reader to release the lock will signal the writer to wake up. refcount might overflow if there are many competing readers, in which case read-locking will fail. Readers are expected to handle such failures. In summary: 1. all readers increment the vm_refcnt; 2. writer sets top usable (writer) bit of vm_refcnt; 3. readers cannot increment the vm_refcnt if the writer bit is set; 4. in the presence of readers, writer must wait for the vm_refcnt to drop to 1 (plus the VMA_LOCK_OFFSET writer bit), indicating an attached vma with no readers; 5. vm_refcnt overflow is handled by the readers. While this vm_lock replacement does not yet result in a smaller vm_area_struct (it stays at 256 bytes due to cacheline alignment), it allows for further size optimization by structure member regrouping to bring the size of vm_area_struct below 192 bytes. [surenb@google.com: fix a crash due to vma_end_read() that should have been removed] Link: https://lkml.kernel.org/r/20250220200208.323769-1-surenb@google.com Link: https://lkml.kernel.org/r/20250213224655.1680278-13-surenb@google.com Signed-off-by: Suren Baghdasaryan <surenb@google.com> Suggested-by: Peter Zijlstra <peterz@infradead.org> Suggested-by: Matthew Wilcox <willy@infradead.org> Tested-by: Shivank Garg <shivankg@amd.com> Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Christian Brauner <brauner@kernel.org> Cc: David Hildenbrand <david@redhat.com> Cc: David Howells <dhowells@redhat.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Hugh Dickins <hughd@google.com> Cc: Jann Horn <jannh@google.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Klara Modin <klarasmodin@gmail.com> Cc: Liam R. Howlett <Liam.Howlett@Oracle.com> Cc: Lokesh Gidra <lokeshgidra@google.com> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Cc: Mateusz Guzik <mjguzik@gmail.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Minchan Kim <minchan@google.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Pasha Tatashin <pasha.tatashin@soleen.com> Cc: "Paul E . McKenney" <paulmck@kernel.org> Cc: Peter Xu <peterx@redhat.com> Cc: Shakeel Butt <shakeel.butt@linux.dev> Cc: Sourav Panda <souravpanda@google.com> Cc: Wei Yang <richard.weiyang@gmail.com> Cc: Will Deacon <will@kernel.org> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/mm.h128
-rw-r--r--include/linux/mm_types.h22
2 files changed, 95 insertions, 55 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c24c521e38a2..06f179c844c3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -32,6 +32,7 @@
#include <linux/memremap.h>
#include <linux/slab.h>
#include <linux/cacheinfo.h>
+#include <linux/rcuwait.h>
struct mempolicy;
struct anon_vma;
@@ -697,19 +698,54 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_PER_VMA_LOCK
-static inline void vma_lock_init(struct vm_area_struct *vma)
+static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
{
- init_rwsem(&vma->vm_lock.lock);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ static struct lock_class_key lockdep_key;
+
+ lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0);
+#endif
+ if (reset_refcnt)
+ refcount_set(&vma->vm_refcnt, 0);
vma->vm_lock_seq = UINT_MAX;
}
+static inline bool is_vma_writer_only(int refcnt)
+{
+ /*
+ * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma
+ * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on
+ * a detached vma happens only in vma_mark_detached() and is a rare
+ * case, therefore most of the time there will be no unnecessary wakeup.
+ */
+ return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1;
+}
+
+static inline void vma_refcount_put(struct vm_area_struct *vma)
+{
+ /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
+ struct mm_struct *mm = vma->vm_mm;
+ int oldcnt;
+
+ rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+ if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) {
+
+ if (is_vma_writer_only(oldcnt - 1))
+ rcuwait_wake_up(&mm->vma_writer_wait);
+ }
+}
+
/*
* Try to read-lock a vma. The function is allowed to occasionally yield false
* locked result to avoid performance overhead, in which case we fall back to
* using mmap_lock. The function should never yield false unlocked result.
+ * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got
+ * detached.
*/
-static inline bool vma_start_read(struct vm_area_struct *vma)
+static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma)
{
+ int oldcnt;
+
/*
* Check before locking. A race might cause false locked result.
* We can use READ_ONCE() for the mm_lock_seq here, and don't need
@@ -718,15 +754,25 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
* need ordering is below.
*/
if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
- return false;
+ return NULL;
- if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0))
- return false;
+ /*
+ * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
+ * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
+ * Acquire fence is required here to avoid reordering against later
+ * vm_lock_seq check and checks inside lock_vma_under_rcu().
+ */
+ if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
+ VMA_REF_LIMIT))) {
+ /* return EAGAIN if vma got detached from under us */
+ return oldcnt ? NULL : ERR_PTR(-EAGAIN);
+ }
+ rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
/*
- * Overflow might produce false locked result.
+ * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
* False unlocked result is impossible because we modify and check
- * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
+ * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
* modification invalidates all existing locks.
*
* We must use ACQUIRE semantics for the mm_lock_seq so that if we are
@@ -735,10 +781,11 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
* This pairs with RELEASE semantics in vma_end_write_all().
*/
if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
- up_read(&vma->vm_lock.lock);
- return false;
+ vma_refcount_put(vma);
+ return NULL;
}
- return true;
+
+ return vma;
}
/*
@@ -749,8 +796,14 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
*/
static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
{
+ int oldcnt;
+
mmap_assert_locked(vma->vm_mm);
- down_read_nested(&vma->vm_lock.lock, subclass);
+ if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
+ VMA_REF_LIMIT)))
+ return false;
+
+ rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
return true;
}
@@ -762,16 +815,12 @@ static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int
*/
static inline bool vma_start_read_locked(struct vm_area_struct *vma)
{
- mmap_assert_locked(vma->vm_mm);
- down_read(&vma->vm_lock.lock);
- return true;
+ return vma_start_read_locked_nested(vma, 0);
}
static inline void vma_end_read(struct vm_area_struct *vma)
{
- rcu_read_lock(); /* keeps vma alive till the end of up_read */
- up_read(&vma->vm_lock.lock);
- rcu_read_unlock();
+ vma_refcount_put(vma);
}
/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
@@ -813,38 +862,35 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma)
static inline void vma_assert_locked(struct vm_area_struct *vma)
{
- if (!rwsem_is_locked(&vma->vm_lock.lock))
- vma_assert_write_locked(vma);
+ unsigned int mm_lock_seq;
+
+ VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 &&
+ !__is_vma_write_locked(vma, &mm_lock_seq), vma);
}
+/*
+ * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
+ * assertions should be made either under mmap_write_lock or when the object
+ * has been isolated under mmap_write_lock, ensuring no competing writers.
+ */
static inline void vma_assert_attached(struct vm_area_struct *vma)
{
- WARN_ON_ONCE(vma->detached);
+ WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt));
}
static inline void vma_assert_detached(struct vm_area_struct *vma)
{
- WARN_ON_ONCE(!vma->detached);
+ WARN_ON_ONCE(refcount_read(&vma->vm_refcnt));
}
static inline void vma_mark_attached(struct vm_area_struct *vma)
{
- vma_assert_detached(vma);
- vma->detached = false;
-}
-
-static inline void vma_mark_detached(struct vm_area_struct *vma)
-{
- /* When detaching vma should be write-locked */
vma_assert_write_locked(vma);
- vma_assert_attached(vma);
- vma->detached = true;
+ vma_assert_detached(vma);
+ refcount_set(&vma->vm_refcnt, 1);
}
-static inline bool is_vma_detached(struct vm_area_struct *vma)
-{
- return vma->detached;
-}
+void vma_mark_detached(struct vm_area_struct *vma);
static inline void release_fault_lock(struct vm_fault *vmf)
{
@@ -867,9 +913,9 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
#else /* CONFIG_PER_VMA_LOCK */
-static inline void vma_lock_init(struct vm_area_struct *vma) {}
-static inline bool vma_start_read(struct vm_area_struct *vma)
- { return false; }
+static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {}
+static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma)
+ { return NULL; }
static inline void vma_end_read(struct vm_area_struct *vma) {}
static inline void vma_start_write(struct vm_area_struct *vma) {}
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
@@ -910,12 +956,8 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
vma->vm_mm = mm;
vma->vm_ops = &vma_dummy_vm_ops;
INIT_LIST_HEAD(&vma->anon_vma_chain);
-#ifdef CONFIG_PER_VMA_LOCK
- /* vma is not locked, can't use vma_mark_detached() */
- vma->detached = true;
-#endif
vma_numab_state_init(vma);
- vma_lock_init(vma);
+ vma_lock_init(vma, false);
}
/* Use when VMA is not part of the VMA tree and needs no locking */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 36dea20cd101..9de0a6cb3c2d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -19,6 +19,7 @@
#include <linux/workqueue.h>
#include <linux/seqlock.h>
#include <linux/percpu_counter.h>
+#include <linux/types.h>
#include <asm/mmu.h>
@@ -629,9 +630,8 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
}
#endif
-struct vma_lock {
- struct rw_semaphore lock;
-};
+#define VMA_LOCK_OFFSET 0x40000000
+#define VMA_REF_LIMIT (VMA_LOCK_OFFSET - 1)
struct vma_numab_state {
/*
@@ -710,18 +710,12 @@ struct vm_area_struct {
#ifdef CONFIG_PER_VMA_LOCK
/*
- * Flag to indicate areas detached from the mm->mm_mt tree.
- * Unstable RCU readers are allowed to read this.
- */
- bool detached;
-
- /*
* Can only be written (using WRITE_ONCE()) while holding both:
* - mmap_lock (in write mode)
- * - vm_lock->lock (in write mode)
+ * - vm_refcnt bit at VMA_LOCK_OFFSET is set
* Can be read reliably while holding one of:
* - mmap_lock (in read or write mode)
- * - vm_lock->lock (in read or write mode)
+ * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1
* Can be read unreliably (using READ_ONCE()) for pessimistic bailout
* while holding nothing (except RCU to keep the VMA struct allocated).
*
@@ -784,7 +778,10 @@ struct vm_area_struct {
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
#ifdef CONFIG_PER_VMA_LOCK
/* Unstable RCU readers are allowed to read this. */
- struct vma_lock vm_lock ____cacheline_aligned_in_smp;
+ refcount_t vm_refcnt ____cacheline_aligned_in_smp;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map vmlock_dep_map;
+#endif
#endif
} __randomize_layout;
@@ -920,6 +917,7 @@ struct mm_struct {
* by mmlist_lock
*/
#ifdef CONFIG_PER_VMA_LOCK
+ struct rcuwait vma_writer_wait;
/*
* This field has lock-like semantics, meaning it is sometimes
* accessed with ACQUIRE/RELEASE semantics.