summaryrefslogtreecommitdiff
path: root/fs/bcachefs/journal_reclaim.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/bcachefs/journal_reclaim.c')
-rw-r--r--fs/bcachefs/journal_reclaim.c143
1 files changed, 76 insertions, 67 deletions
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 6a9cefb635d6..0042d43b8e57 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -17,6 +17,8 @@
#include <linux/kthread.h>
#include <linux/sched/mm.h>
+static bool __should_discard_bucket(struct journal *, struct journal_device *);
+
/* Free space calculations: */
static unsigned journal_space_from(struct journal_device *ja,
@@ -81,18 +83,20 @@ static struct journal_space
journal_dev_space_available(struct journal *j, struct bch_dev *ca,
enum journal_space_from from)
{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_device *ja = &ca->journal;
unsigned sectors, buckets, unwritten;
+ unsigned bucket_size_aligned = round_down(ca->mi.bucket_size, block_sectors(c));
u64 seq;
if (from == journal_space_total)
return (struct journal_space) {
- .next_entry = ca->mi.bucket_size,
- .total = ca->mi.bucket_size * ja->nr,
+ .next_entry = bucket_size_aligned,
+ .total = bucket_size_aligned * ja->nr,
};
buckets = bch2_journal_dev_buckets_available(j, ja, from);
- sectors = ja->sectors_free;
+ sectors = round_down(ja->sectors_free, block_sectors(c));
/*
* We that we don't allocate the space for a journal entry
@@ -107,7 +111,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
continue;
/* entry won't fit on this device, skip: */
- if (unwritten > ca->mi.bucket_size)
+ if (unwritten > bucket_size_aligned)
continue;
if (unwritten >= sectors) {
@@ -117,7 +121,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
}
buckets--;
- sectors = ca->mi.bucket_size;
+ sectors = bucket_size_aligned;
}
sectors -= unwritten;
@@ -125,12 +129,12 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
if (sectors < ca->mi.bucket_size && buckets) {
buckets--;
- sectors = ca->mi.bucket_size;
+ sectors = bucket_size_aligned;
}
return (struct journal_space) {
.next_entry = sectors,
- .total = sectors + buckets * ca->mi.bucket_size,
+ .total = sectors + buckets * bucket_size_aligned,
};
}
@@ -144,7 +148,6 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
- rcu_read_lock();
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
if (!ca->journal.nr ||
!ca->mi.durability)
@@ -162,12 +165,17 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
array_insert_item(dev_space, nr_devs, pos, space);
}
- rcu_read_unlock();
if (nr_devs < nr_devs_want)
return (struct journal_space) { 0, 0 };
/*
+ * It's possible for bucket size to be misaligned w.r.t. the filesystem
+ * block size:
+ */
+ min_bucket_size = round_down(min_bucket_size, block_sectors(c));
+
+ /*
* We sorted largest to smallest, and we want the smallest out of the
* @nr_devs_want largest devices:
*/
@@ -187,8 +195,8 @@ void bch2_journal_space_available(struct journal *j)
int ret = 0;
lockdep_assert_held(&j->lock);
+ guard(rcu)();
- rcu_read_lock();
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
struct journal_device *ja = &ca->journal;
@@ -203,30 +211,28 @@ void bch2_journal_space_available(struct journal *j)
ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
- if (ja->discard_idx != ja->dirty_idx_ondisk)
- can_discard = true;
+ can_discard |= __should_discard_bucket(j, ja);
max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
nr_online++;
}
- rcu_read_unlock();
j->can_discard = can_discard;
if (nr_online < metadata_replicas_required(c)) {
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
- prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
- "rw journal devs:", nr_online, metadata_replicas_required(c));
-
- rcu_read_lock();
- for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
- prt_printf(&buf, " %s", ca->name);
- rcu_read_unlock();
-
- bch_err(c, "%s", buf.buf);
- printbuf_exit(&buf);
- ret = JOURNAL_ERR_insufficient_devices;
+ if (!(c->sb.features & BIT_ULL(BCH_FEATURE_small_image))) {
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+ prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
+ "rw journal devs:", nr_online, metadata_replicas_required(c));
+
+ for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
+ prt_printf(&buf, " %s", ca->name);
+
+ bch_err(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+ ret = bch_err_throw(c, insufficient_journal_devices);
goto out;
}
@@ -240,7 +246,7 @@ void bch2_journal_space_available(struct journal *j)
total = j->space[journal_space_total].total;
if (!j->space[journal_space_discarded].next_entry)
- ret = JOURNAL_ERR_journal_full;
+ ret = bch_err_throw(c, journal_full);
if ((j->space[journal_space_clean_ondisk].next_entry <
j->space[journal_space_clean_ondisk].total) &&
@@ -252,7 +258,9 @@ void bch2_journal_space_available(struct journal *j)
bch2_journal_set_watermark(j);
out:
- j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
+ j->cur_entry_sectors = !ret
+ ? j->space[journal_space_discarded].next_entry
+ : 0;
j->cur_entry_error = ret;
if (!ret)
@@ -261,12 +269,19 @@ out:
/* Discards - last part of journal reclaim: */
-static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+static bool __should_discard_bucket(struct journal *j, struct journal_device *ja)
{
- bool ret;
+ unsigned min_free = max(4, ja->nr / 8);
+ return bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) <
+ min_free &&
+ ja->discard_idx != ja->dirty_idx_ondisk;
+}
+
+static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+{
spin_lock(&j->lock);
- ret = ja->discard_idx != ja->dirty_idx_ondisk;
+ bool ret = __should_discard_bucket(j, ja);
spin_unlock(&j->lock);
return ret;
@@ -282,12 +297,12 @@ void bch2_journal_do_discards(struct journal *j)
mutex_lock(&j->discard_lock);
- for_each_rw_member(c, ca) {
+ for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_do_discards) {
struct journal_device *ja = &ca->journal;
while (should_discard_bucket(j, ja)) {
if (!c->opts.nochanges &&
- ca->mi.discard &&
+ bch2_discard_opt_enabled(c, ca) &&
bdev_max_discard_sectors(ca->disk_sb.bdev))
blkdev_issue_discard(ca->disk_sb.bdev,
bucket_to_sector(ca,
@@ -384,12 +399,16 @@ void bch2_journal_pin_drop(struct journal *j,
spin_unlock(&j->lock);
}
-static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
+static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin,
+ journal_pin_flush_fn fn)
{
if (fn == bch2_btree_node_flush0 ||
- fn == bch2_btree_node_flush1)
- return JOURNAL_PIN_TYPE_btree;
- else if (fn == bch2_btree_key_cache_journal_flush)
+ fn == bch2_btree_node_flush1) {
+ unsigned idx = fn == bch2_btree_node_flush1;
+ struct btree *b = container_of(pin, struct btree, writes[idx].journal);
+
+ return JOURNAL_PIN_TYPE_btree0 - b->c.level;
+ } else if (fn == bch2_btree_key_cache_journal_flush)
return JOURNAL_PIN_TYPE_key_cache;
else
return JOURNAL_PIN_TYPE_other;
@@ -441,7 +460,7 @@ void bch2_journal_pin_copy(struct journal *j,
bool reclaim = __journal_pin_drop(j, dst);
- bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
+ bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
@@ -465,7 +484,7 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
bool reclaim = __journal_pin_drop(j, pin);
- bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
+ bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
@@ -587,7 +606,7 @@ static size_t journal_flush_pins(struct journal *j,
spin_lock(&j->lock);
/* Pin might have been dropped or rearmed: */
if (likely(!err && !j->flush_in_progress_dropped))
- list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]);
+ list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]);
j->flush_in_progress = NULL;
j->flush_in_progress_dropped = false;
spin_unlock(&j->lock);
@@ -608,9 +627,10 @@ static u64 journal_seq_to_flush(struct journal *j)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
u64 seq_to_flush = 0;
- spin_lock(&j->lock);
+ guard(spinlock)(&j->lock);
+ guard(rcu)();
- for_each_rw_member(c, ca) {
+ for_each_rw_member_rcu(c, ca) {
struct journal_device *ja = &ca->journal;
unsigned nr_buckets, bucket_to_flush;
@@ -620,20 +640,15 @@ static u64 journal_seq_to_flush(struct journal *j)
/* Try to keep the journal at most half full: */
nr_buckets = ja->nr / 2;
- nr_buckets = min(nr_buckets, ja->nr);
-
bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
seq_to_flush = max(seq_to_flush,
ja->bucket_seq[bucket_to_flush]);
}
/* Also flush if the pin fifo is more than half full */
- seq_to_flush = max_t(s64, seq_to_flush,
- (s64) journal_cur_seq(j) -
- (j->pin.size >> 1));
- spin_unlock(&j->lock);
-
- return seq_to_flush;
+ return max_t(s64, seq_to_flush,
+ (s64) journal_cur_seq(j) -
+ (j->pin.size >> 1));
}
/**
@@ -641,7 +656,6 @@ static u64 journal_seq_to_flush(struct journal *j)
* @j: journal object
* @direct: direct or background reclaim?
* @kicked: requested to run since we last ran?
- * Returns: 0 on success, or -EIO if the journal has been shutdown
*
* Background journal reclaim writes out btree nodes. It should be run
* early enough so that we never completely run out of journal buckets.
@@ -681,11 +695,11 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
if (kthread && kthread_should_stop())
break;
- if (bch2_journal_error(j)) {
- ret = -EIO;
+ ret = bch2_journal_error(j);
+ if (ret)
break;
- }
+ /* XXX shove journal discards off to another thread */
bch2_journal_do_discards(j);
seq_to_flush = journal_seq_to_flush(j);
@@ -869,18 +883,13 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
mutex_lock(&j->reclaim_lock);
- if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
- BIT(JOURNAL_PIN_TYPE_key_cache)|
- BIT(JOURNAL_PIN_TYPE_other))) {
- *did_work = true;
- goto unlock;
- }
-
- if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
- BIT(JOURNAL_PIN_TYPE_btree))) {
- *did_work = true;
- goto unlock;
- }
+ for (int type = JOURNAL_PIN_TYPE_NR - 1;
+ type >= 0;
+ --type)
+ if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) {
+ *did_work = true;
+ goto unlock;
+ }
if (seq_to_flush > journal_cur_seq(j))
bch2_journal_entry_close(j);
@@ -953,7 +962,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
seq = 0;
spin_lock(&j->lock);
while (!ret) {
- struct bch_replicas_padded replicas;
+ union bch_replicas_padded replicas;
seq = max(seq, journal_last_seq(j));
if (seq >= j->pin.back)