summaryrefslogtreecommitdiff
path: root/fs/btrfs/locking.c
blob: 14ceb2ce33ac057d0d397588aa77ea8bb06e9a16 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/page-flags.h>
#include <asm/bug.h>
#include "misc.h"
#include "ctree.h"
#include "extent_io.h"
#include "locking.h"

/*
 * Extent buffer locking
 * =====================
 *
 * The locks use a custom scheme that allows to do more operations than are
 * available fromt current locking primitives. The building blocks are still
 * rwlock and wait queues.
 *
 * Required semantics:
 *
 * - reader/writer exclusion
 * - writer/writer exclusion
 * - reader/reader sharing
 * - spinning lock semantics
 * - blocking lock semantics
 * - try-lock semantics for readers and writers
 * - one level nesting, allowing read lock to be taken by the same thread that
 *   already has write lock
 *
 * The extent buffer locks (also called tree locks) manage access to eb data
 * related to the storage in the b-tree (keys, items, but not the individual
 * members of eb).
 * We want concurrency of many readers and safe updates. The underlying locking
 * is done by read-write spinlock and the blocking part is implemented using
 * counters and wait queues.
 *
 * spinning semantics - the low-level rwlock is held so all other threads that
 *                      want to take it are spinning on it.
 *
 * blocking semantics - the low-level rwlock is not held but the counter
 *                      denotes how many times the blocking lock was held;
 *                      sleeping is possible
 *
 * Write lock always allows only one thread to access the data.
 *
 *
 * Debugging
 * ---------
 *
 * There are additional state counters that are asserted in various contexts,
 * removed from non-debug build to reduce extent_buffer size and for
 * performance reasons.
 *
 *
 * Lock recursion
 * --------------
 *
 * A write operation on a tree might indirectly start a look up on the same
 * tree.  This can happen when btrfs_cow_block locks the tree and needs to
 * lookup free extents.
 *
 * btrfs_cow_block
 *   ..
 *   alloc_tree_block_no_bg_flush
 *     btrfs_alloc_tree_block
 *       btrfs_reserve_extent
 *         ..
 *         load_free_space_cache
 *           ..
 *           btrfs_lookup_file_extent
 *             btrfs_search_slot
 *
 *
 * Locking pattern - spinning
 * --------------------------
 *
 * The simple locking scenario, the +--+ denotes the spinning section.
 *
 * +- btrfs_tree_lock
 * | - extent_buffer::rwlock is held
 * | - no heavy operations should happen, eg. IO, memory allocations, large
 * |   structure traversals
 * +- btrfs_tree_unock
*
*
 * Locking pattern - blocking
 * --------------------------
 *
 * The blocking write uses the following scheme.  The +--+ denotes the spinning
 * section.
 *
 * +- btrfs_tree_lock
 * |
 * +- btrfs_set_lock_blocking_write
 *
 *   - allowed: IO, memory allocations, etc.
 *
 * -- btrfs_tree_unlock - note, no explicit unblocking necessary
 *
 *
 * Blocking read is similar.
 *
 * +- btrfs_tree_read_lock
 * |
 * +- btrfs_set_lock_blocking_read
 *
 *  - heavy operations allowed
 *
 * +- btrfs_tree_read_unlock_blocking
 * |
 * +- btrfs_tree_read_unlock
 *
 */

#ifdef CONFIG_BTRFS_DEBUG
static inline void btrfs_assert_spinning_writers_get(struct extent_buffer *eb)
{
	WARN_ON(eb->spinning_writers);
	eb->spinning_writers++;
}

static inline void btrfs_assert_spinning_writers_put(struct extent_buffer *eb)
{
	WARN_ON(eb->spinning_writers != 1);
	eb->spinning_writers--;
}

static inline void btrfs_assert_no_spinning_writers(struct extent_buffer *eb)
{
	WARN_ON(eb->spinning_writers);
}

static inline void btrfs_assert_spinning_readers_get(struct extent_buffer *eb)
{
	atomic_inc(&eb->spinning_readers);
}

static inline void btrfs_assert_spinning_readers_put(struct extent_buffer *eb)
{
	WARN_ON(atomic_read(&eb->spinning_readers) == 0);
	atomic_dec(&eb->spinning_readers);
}

static inline void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb)
{
	atomic_inc(&eb->read_locks);
}

static inline void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb)
{
	atomic_dec(&eb->read_locks);
}

static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
{
	BUG_ON(!atomic_read(&eb->read_locks));
}

static inline void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb)
{
	eb->write_locks++;
}

static inline void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb)
{
	eb->write_locks--;
}

#else
static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) { }
static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) { }
static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) { }
static void btrfs_assert_spinning_readers_put(struct extent_buffer *eb) { }
static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) { }
static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { }
static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) { }
static void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) { }
static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) { }
static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { }
#endif

/*
 * Mark already held read lock as blocking. Can be nested in write lock by the
 * same thread.
 *
 * Use when there are potentially long operations ahead so other thread waiting
 * on the lock will not actively spin but sleep instead.
 *
 * The rwlock is released and blocking reader counter is increased.
 */
void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
{
	trace_btrfs_set_lock_blocking_read(eb);
	/*
	 * No lock is required.  The lock owner may change if we have a read
	 * lock, but it won't change to or away from us.  If we have the write
	 * lock, we are the owner and it'll never change.
	 */
	if (eb->lock_recursed && current->pid == eb->lock_owner)
		return;
	btrfs_assert_tree_read_locked(eb);
	atomic_inc(&eb->blocking_readers);
	btrfs_assert_spinning_readers_put(eb);
	read_unlock(&eb->lock);
}

/*
 * Mark already held write lock as blocking.
 *
 * Use when there are potentially long operations ahead so other threads
 * waiting on the lock will not actively spin but sleep instead.
 *
 * The rwlock is released and blocking writers is set.
 */
void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
{
	trace_btrfs_set_lock_blocking_write(eb);
	/*
	 * No lock is required.  The lock owner may change if we have a read
	 * lock, but it won't change to or away from us.  If we have the write
	 * lock, we are the owner and it'll never change.
	 */
	if (eb->lock_recursed && current->pid == eb->lock_owner)
		return;
	if (eb->blocking_writers == 0) {
		btrfs_assert_spinning_writers_put(eb);
		btrfs_assert_tree_locked(eb);
		WRITE_ONCE(eb->blocking_writers, 1);
		write_unlock(&eb->lock);
	}
}

/*
 * Lock the extent buffer for read. Wait for any writers (spinning or blocking).
 * Can be nested in write lock by the same thread.
 *
 * Use when the locked section does only lightweight actions and busy waiting
 * would be cheaper than making other threads do the wait/wake loop.
 *
 * The rwlock is held upon exit.
 */
void btrfs_tree_read_lock(struct extent_buffer *eb)
{
	u64 start_ns = 0;

	if (trace_btrfs_tree_read_lock_enabled())
		start_ns = ktime_get_ns();
again:
	read_lock(&eb->lock);
	BUG_ON(eb->blocking_writers == 0 &&
	       current->pid == eb->lock_owner);
	if (eb->blocking_writers) {
		if (current->pid == eb->lock_owner) {
			/*
			 * This extent is already write-locked by our thread.
			 * We allow an additional read lock to be added because
			 * it's for the same thread. btrfs_find_all_roots()
			 * depends on this as it may be called on a partly
			 * (write-)locked tree.
			 */
			BUG_ON(eb->lock_recursed);
			eb->lock_recursed = true;
			read_unlock(&eb->lock);
			trace_btrfs_tree_read_lock(eb, start_ns);
			return;
		}
		read_unlock(&eb->lock);
		wait_event(eb->write_lock_wq,
			   READ_ONCE(eb->blocking_writers) == 0);
		goto again;
	}
	btrfs_assert_tree_read_locks_get(eb);
	btrfs_assert_spinning_readers_get(eb);
	trace_btrfs_tree_read_lock(eb, start_ns);
}

/*
 * Lock extent buffer for read, optimistically expecting that there are no
 * contending blocking writers. If there are, don't wait.
 *
 * Return 1 if the rwlock has been taken, 0 otherwise
 */
int btrfs_tree_read_lock_atomic(struct extent_buffer *eb)
{
	if (READ_ONCE(eb->blocking_writers))
		return 0;

	read_lock(&eb->lock);
	/* Refetch value after lock */
	if (READ_ONCE(eb->blocking_writers)) {
		read_unlock(&eb->lock);
		return 0;
	}
	btrfs_assert_tree_read_locks_get(eb);
	btrfs_assert_spinning_readers_get(eb);
	trace_btrfs_tree_read_lock_atomic(eb);
	return 1;
}

/*
 * Try-lock for read. Don't block or wait for contending writers.
 *
 * Retrun 1 if the rwlock has been taken, 0 otherwise
 */
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
	if (READ_ONCE(eb->blocking_writers))
		return 0;

	if (!read_trylock(&eb->lock))
		return 0;

	/* Refetch value after lock */
	if (READ_ONCE(eb->blocking_writers)) {
		read_unlock(&eb->lock);
		return 0;
	}
	btrfs_assert_tree_read_locks_get(eb);
	btrfs_assert_spinning_readers_get(eb);
	trace_btrfs_try_tree_read_lock(eb);
	return 1;
}

/*
 * Try-lock for write. May block until the lock is uncontended, but does not
 * wait until it is free.
 *
 * Retrun 1 if the rwlock has been taken, 0 otherwise
 */
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
	if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers))
		return 0;

	write_lock(&eb->lock);
	/* Refetch value after lock */
	if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers)) {
		write_unlock(&eb->lock);
		return 0;
	}
	btrfs_assert_tree_write_locks_get(eb);
	btrfs_assert_spinning_writers_get(eb);
	eb->lock_owner = current->pid;
	trace_btrfs_try_tree_write_lock(eb);
	return 1;
}

/*
 * Release read lock. Must be used only if the lock is in spinning mode.  If
 * the read lock is nested, must pair with read lock before the write unlock.
 *
 * The rwlock is not held upon exit.
 */
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
	trace_btrfs_tree_read_unlock(eb);
	/*
	 * if we're nested, we have the write lock.  No new locking
	 * is needed as long as we are the lock owner.
	 * The write unlock will do a barrier for us, and the lock_recursed
	 * field only matters to the lock owner.
	 */
	if (eb->lock_recursed && current->pid == eb->lock_owner) {
		eb->lock_recursed = false;
		return;
	}
	btrfs_assert_tree_read_locked(eb);
	btrfs_assert_spinning_readers_put(eb);
	btrfs_assert_tree_read_locks_put(eb);
	read_unlock(&eb->lock);
}

/*
 * Release read lock, previously set to blocking by a pairing call to
 * btrfs_set_lock_blocking_read(). Can be nested in write lock by the same
 * thread.
 *
 * State of rwlock is unchanged, last reader wakes waiting threads.
 */
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
{
	trace_btrfs_tree_read_unlock_blocking(eb);
	/*
	 * if we're nested, we have the write lock.  No new locking
	 * is needed as long as we are the lock owner.
	 * The write unlock will do a barrier for us, and the lock_recursed
	 * field only matters to the lock owner.
	 */
	if (eb->lock_recursed && current->pid == eb->lock_owner) {
		eb->lock_recursed = false;
		return;
	}
	btrfs_assert_tree_read_locked(eb);
	WARN_ON(atomic_read(&eb->blocking_readers) == 0);
	/* atomic_dec_and_test implies a barrier */
	if (atomic_dec_and_test(&eb->blocking_readers))
		cond_wake_up_nomb(&eb->read_lock_wq);
	btrfs_assert_tree_read_locks_put(eb);
}

/*
 * Lock for write. Wait for all blocking and spinning readers and writers. This
 * starts context where reader lock could be nested by the same thread.
 *
 * The rwlock is held for write upon exit.
 */
void btrfs_tree_lock(struct extent_buffer *eb)
	__acquires(&eb->lock)
{
	u64 start_ns = 0;

	if (trace_btrfs_tree_lock_enabled())
		start_ns = ktime_get_ns();

	WARN_ON(eb->lock_owner == current->pid);
again:
	wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
	wait_event(eb->write_lock_wq, READ_ONCE(eb->blocking_writers) == 0);
	write_lock(&eb->lock);
	/* Refetch value after lock */
	if (atomic_read(&eb->blocking_readers) ||
	    READ_ONCE(eb->blocking_writers)) {
		write_unlock(&eb->lock);
		goto again;
	}
	btrfs_assert_spinning_writers_get(eb);
	btrfs_assert_tree_write_locks_get(eb);
	eb->lock_owner = current->pid;
	trace_btrfs_tree_lock(eb, start_ns);
}

/*
 * Release the write lock, either blocking or spinning (ie. there's no need
 * for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking).
 * This also ends the context for nesting, the read lock must have been
 * released already.
 *
 * Tasks blocked and waiting are woken, rwlock is not held upon exit.
 */
void btrfs_tree_unlock(struct extent_buffer *eb)
{
	/*
	 * This is read both locked and unlocked but always by the same thread
	 * that already owns the lock so we don't need to use READ_ONCE
	 */
	int blockers = eb->blocking_writers;

	BUG_ON(blockers > 1);

	btrfs_assert_tree_locked(eb);
	trace_btrfs_tree_unlock(eb);
	eb->lock_owner = 0;
	btrfs_assert_tree_write_locks_put(eb);

	if (blockers) {
		btrfs_assert_no_spinning_writers(eb);
		/* Unlocked write */
		WRITE_ONCE(eb->blocking_writers, 0);
		/*
		 * We need to order modifying blocking_writers above with
		 * actually waking up the sleepers to ensure they see the
		 * updated value of blocking_writers
		 */
		cond_wake_up(&eb->write_lock_wq);
	} else {
		btrfs_assert_spinning_writers_put(eb);
		write_unlock(&eb->lock);
	}
}

/*
 * Set all locked nodes in the path to blocking locks.  This should be done
 * before scheduling
 */
void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!p->nodes[i] || !p->locks[i])
			continue;
		/*
		 * If we currently have a spinning reader or writer lock this
		 * will bump the count of blocking holders and drop the
		 * spinlock.
		 */
		if (p->locks[i] == BTRFS_READ_LOCK) {
			btrfs_set_lock_blocking_read(p->nodes[i]);
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		} else if (p->locks[i] == BTRFS_WRITE_LOCK) {
			btrfs_set_lock_blocking_write(p->nodes[i]);
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
		}
	}
}

/*
 * This releases any locks held in the path starting at level and going all the
 * way up to the root.
 *
 * btrfs_search_slot will keep the lock held on higher nodes in a few corner
 * cases, such as COW of the block at slot zero in the node.  This ignores
 * those rules, and it should only be called when there are no more updates to
 * be done higher up in the tree.
 */
void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
{
	int i;

	if (path->keep_locks)
		return;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			continue;
		if (!path->locks[i])
			continue;
		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
		path->locks[i] = 0;
	}
}

/*
 * Loop around taking references on and locking the root node of the tree until
 * we end up with a lock on the root node.
 *
 * Return: root extent buffer with write lock held
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/*
 * Loop around taking references on and locking the root node of the tree until
 * we end up with a lock on the root node.
 *
 * Return: root extent buffer with read lock held
 */
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_read_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/*
 * DREW locks
 * ==========
 *
 * DREW stands for double-reader-writer-exclusion lock. It's used in situation
 * where you want to provide A-B exclusion but not AA or BB.
 *
 * Currently implementation gives more priority to reader. If a reader and a
 * writer both race to acquire their respective sides of the lock the writer
 * would yield its lock as soon as it detects a concurrent reader. Additionally
 * if there are pending readers no new writers would be allowed to come in and
 * acquire the lock.
 */

int btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
{
	int ret;

	ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL);
	if (ret)
		return ret;

	atomic_set(&lock->readers, 0);
	init_waitqueue_head(&lock->pending_readers);
	init_waitqueue_head(&lock->pending_writers);

	return 0;
}

void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock)
{
	percpu_counter_destroy(&lock->writers);
}

/* Return true if acquisition is successful, false otherwise */
bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock)
{
	if (atomic_read(&lock->readers))
		return false;

	percpu_counter_inc(&lock->writers);

	/* Ensure writers count is updated before we check for pending readers */
	smp_mb();
	if (atomic_read(&lock->readers)) {
		btrfs_drew_write_unlock(lock);
		return false;
	}

	return true;
}

void btrfs_drew_write_lock(struct btrfs_drew_lock *lock)
{
	while (true) {
		if (btrfs_drew_try_write_lock(lock))
			return;
		wait_event(lock->pending_writers, !atomic_read(&lock->readers));
	}
}

void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock)
{
	percpu_counter_dec(&lock->writers);
	cond_wake_up(&lock->pending_readers);
}

void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
{
	atomic_inc(&lock->readers);

	/*
	 * Ensure the pending reader count is perceieved BEFORE this reader
	 * goes to sleep in case of active writers. This guarantees new writers
	 * won't be allowed and that the current reader will be woken up when
	 * the last active writer finishes its jobs.
	 */
	smp_mb__after_atomic();

	wait_event(lock->pending_readers,
		   percpu_counter_sum(&lock->writers) == 0);
}

void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock)
{
	/*
	 * atomic_dec_and_test implies a full barrier, so woken up writers
	 * are guaranteed to see the decrement
	 */
	if (atomic_dec_and_test(&lock->readers))
		wake_up(&lock->pending_writers);
}