diff options
Diffstat (limited to 'fs/bcachefs/journal_reclaim.c')
-rw-r--r-- | fs/bcachefs/journal_reclaim.c | 143 |
1 files changed, 76 insertions, 67 deletions
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 6a9cefb635d6..0042d43b8e57 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -17,6 +17,8 @@ #include <linux/kthread.h> #include <linux/sched/mm.h> +static bool __should_discard_bucket(struct journal *, struct journal_device *); + /* Free space calculations: */ static unsigned journal_space_from(struct journal_device *ja, @@ -81,18 +83,20 @@ static struct journal_space journal_dev_space_available(struct journal *j, struct bch_dev *ca, enum journal_space_from from) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_device *ja = &ca->journal; unsigned sectors, buckets, unwritten; + unsigned bucket_size_aligned = round_down(ca->mi.bucket_size, block_sectors(c)); u64 seq; if (from == journal_space_total) return (struct journal_space) { - .next_entry = ca->mi.bucket_size, - .total = ca->mi.bucket_size * ja->nr, + .next_entry = bucket_size_aligned, + .total = bucket_size_aligned * ja->nr, }; buckets = bch2_journal_dev_buckets_available(j, ja, from); - sectors = ja->sectors_free; + sectors = round_down(ja->sectors_free, block_sectors(c)); /* * We that we don't allocate the space for a journal entry @@ -107,7 +111,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, continue; /* entry won't fit on this device, skip: */ - if (unwritten > ca->mi.bucket_size) + if (unwritten > bucket_size_aligned) continue; if (unwritten >= sectors) { @@ -117,7 +121,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, } buckets--; - sectors = ca->mi.bucket_size; + sectors = bucket_size_aligned; } sectors -= unwritten; @@ -125,12 +129,12 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, if (sectors < ca->mi.bucket_size && buckets) { buckets--; - sectors = ca->mi.bucket_size; + sectors = bucket_size_aligned; } return (struct journal_space) { .next_entry = sectors, - .total = sectors + buckets * ca->mi.bucket_size, + .total = sectors + buckets * bucket_size_aligned, }; } @@ -144,7 +148,6 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); - rcu_read_lock(); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { if (!ca->journal.nr || !ca->mi.durability) @@ -162,12 +165,17 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne array_insert_item(dev_space, nr_devs, pos, space); } - rcu_read_unlock(); if (nr_devs < nr_devs_want) return (struct journal_space) { 0, 0 }; /* + * It's possible for bucket size to be misaligned w.r.t. the filesystem + * block size: + */ + min_bucket_size = round_down(min_bucket_size, block_sectors(c)); + + /* * We sorted largest to smallest, and we want the smallest out of the * @nr_devs_want largest devices: */ @@ -187,8 +195,8 @@ void bch2_journal_space_available(struct journal *j) int ret = 0; lockdep_assert_held(&j->lock); + guard(rcu)(); - rcu_read_lock(); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; @@ -203,30 +211,28 @@ void bch2_journal_space_available(struct journal *j) ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; - if (ja->discard_idx != ja->dirty_idx_ondisk) - can_discard = true; + can_discard |= __should_discard_bucket(j, ja); max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); nr_online++; } - rcu_read_unlock(); j->can_discard = can_discard; if (nr_online < metadata_replicas_required(c)) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" - "rw journal devs:", nr_online, metadata_replicas_required(c)); - - rcu_read_lock(); - for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) - prt_printf(&buf, " %s", ca->name); - rcu_read_unlock(); - - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); - ret = JOURNAL_ERR_insufficient_devices; + if (!(c->sb.features & BIT_ULL(BCH_FEATURE_small_image))) { + struct printbuf buf = PRINTBUF; + buf.atomic++; + prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" + "rw journal devs:", nr_online, metadata_replicas_required(c)); + + for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) + prt_printf(&buf, " %s", ca->name); + + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); + } + ret = bch_err_throw(c, insufficient_journal_devices); goto out; } @@ -240,7 +246,7 @@ void bch2_journal_space_available(struct journal *j) total = j->space[journal_space_total].total; if (!j->space[journal_space_discarded].next_entry) - ret = JOURNAL_ERR_journal_full; + ret = bch_err_throw(c, journal_full); if ((j->space[journal_space_clean_ondisk].next_entry < j->space[journal_space_clean_ondisk].total) && @@ -252,7 +258,9 @@ void bch2_journal_space_available(struct journal *j) bch2_journal_set_watermark(j); out: - j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; + j->cur_entry_sectors = !ret + ? j->space[journal_space_discarded].next_entry + : 0; j->cur_entry_error = ret; if (!ret) @@ -261,12 +269,19 @@ out: /* Discards - last part of journal reclaim: */ -static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +static bool __should_discard_bucket(struct journal *j, struct journal_device *ja) { - bool ret; + unsigned min_free = max(4, ja->nr / 8); + return bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) < + min_free && + ja->discard_idx != ja->dirty_idx_ondisk; +} + +static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +{ spin_lock(&j->lock); - ret = ja->discard_idx != ja->dirty_idx_ondisk; + bool ret = __should_discard_bucket(j, ja); spin_unlock(&j->lock); return ret; @@ -282,12 +297,12 @@ void bch2_journal_do_discards(struct journal *j) mutex_lock(&j->discard_lock); - for_each_rw_member(c, ca) { + for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_do_discards) { struct journal_device *ja = &ca->journal; while (should_discard_bucket(j, ja)) { if (!c->opts.nochanges && - ca->mi.discard && + bch2_discard_opt_enabled(c, ca) && bdev_max_discard_sectors(ca->disk_sb.bdev)) blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, @@ -384,12 +399,16 @@ void bch2_journal_pin_drop(struct journal *j, spin_unlock(&j->lock); } -static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) +static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin, + journal_pin_flush_fn fn) { if (fn == bch2_btree_node_flush0 || - fn == bch2_btree_node_flush1) - return JOURNAL_PIN_TYPE_btree; - else if (fn == bch2_btree_key_cache_journal_flush) + fn == bch2_btree_node_flush1) { + unsigned idx = fn == bch2_btree_node_flush1; + struct btree *b = container_of(pin, struct btree, writes[idx].journal); + + return JOURNAL_PIN_TYPE_btree0 - b->c.level; + } else if (fn == bch2_btree_key_cache_journal_flush) return JOURNAL_PIN_TYPE_key_cache; else return JOURNAL_PIN_TYPE_other; @@ -441,7 +460,7 @@ void bch2_journal_pin_copy(struct journal *j, bool reclaim = __journal_pin_drop(j, dst); - bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn)); + bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); @@ -465,7 +484,7 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, bool reclaim = __journal_pin_drop(j, pin); - bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn)); + bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); @@ -587,7 +606,7 @@ static size_t journal_flush_pins(struct journal *j, spin_lock(&j->lock); /* Pin might have been dropped or rearmed: */ if (likely(!err && !j->flush_in_progress_dropped)) - list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]); + list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]); j->flush_in_progress = NULL; j->flush_in_progress_dropped = false; spin_unlock(&j->lock); @@ -608,9 +627,10 @@ static u64 journal_seq_to_flush(struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); u64 seq_to_flush = 0; - spin_lock(&j->lock); + guard(spinlock)(&j->lock); + guard(rcu)(); - for_each_rw_member(c, ca) { + for_each_rw_member_rcu(c, ca) { struct journal_device *ja = &ca->journal; unsigned nr_buckets, bucket_to_flush; @@ -620,20 +640,15 @@ static u64 journal_seq_to_flush(struct journal *j) /* Try to keep the journal at most half full: */ nr_buckets = ja->nr / 2; - nr_buckets = min(nr_buckets, ja->nr); - bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; seq_to_flush = max(seq_to_flush, ja->bucket_seq[bucket_to_flush]); } /* Also flush if the pin fifo is more than half full */ - seq_to_flush = max_t(s64, seq_to_flush, - (s64) journal_cur_seq(j) - - (j->pin.size >> 1)); - spin_unlock(&j->lock); - - return seq_to_flush; + return max_t(s64, seq_to_flush, + (s64) journal_cur_seq(j) - + (j->pin.size >> 1)); } /** @@ -641,7 +656,6 @@ static u64 journal_seq_to_flush(struct journal *j) * @j: journal object * @direct: direct or background reclaim? * @kicked: requested to run since we last ran? - * Returns: 0 on success, or -EIO if the journal has been shutdown * * Background journal reclaim writes out btree nodes. It should be run * early enough so that we never completely run out of journal buckets. @@ -681,11 +695,11 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) if (kthread && kthread_should_stop()) break; - if (bch2_journal_error(j)) { - ret = -EIO; + ret = bch2_journal_error(j); + if (ret) break; - } + /* XXX shove journal discards off to another thread */ bch2_journal_do_discards(j); seq_to_flush = journal_seq_to_flush(j); @@ -869,18 +883,13 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, mutex_lock(&j->reclaim_lock); - if (journal_flush_pins_or_still_flushing(j, seq_to_flush, - BIT(JOURNAL_PIN_TYPE_key_cache)| - BIT(JOURNAL_PIN_TYPE_other))) { - *did_work = true; - goto unlock; - } - - if (journal_flush_pins_or_still_flushing(j, seq_to_flush, - BIT(JOURNAL_PIN_TYPE_btree))) { - *did_work = true; - goto unlock; - } + for (int type = JOURNAL_PIN_TYPE_NR - 1; + type >= 0; + --type) + if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) { + *did_work = true; + goto unlock; + } if (seq_to_flush > journal_cur_seq(j)) bch2_journal_entry_close(j); @@ -953,7 +962,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) seq = 0; spin_lock(&j->lock); while (!ret) { - struct bch_replicas_padded replicas; + union bch_replicas_padded replicas; seq = max(seq, journal_last_seq(j)); if (seq >= j->pin.back) |