1 files changed, 76 insertions, 67 deletions
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 6a9cefb635d6..0042d43b8e57 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -17,6 +17,8 @@
 #include <linux/kthread.h>
 #include <linux/sched/mm.h>
 
+static bool __should_discard_bucket(struct journal *, struct journal_device *);
+
 /* Free space calculations: */
 
 static unsigned journal_space_from(struct journal_device *ja,
@@ -81,18 +83,20 @@ static struct journal_space
 journal_dev_space_available(struct journal *j, struct bch_dev *ca,
 			    enum journal_space_from from)
 {
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct journal_device *ja = &ca->journal;
 	unsigned sectors, buckets, unwritten;
+	unsigned bucket_size_aligned = round_down(ca->mi.bucket_size, block_sectors(c));
 	u64 seq;
 
 	if (from == journal_space_total)
 		return (struct journal_space) {
-			.next_entry	= ca->mi.bucket_size,
-			.total		= ca->mi.bucket_size * ja->nr,
+			.next_entry	= bucket_size_aligned,
+			.total		= bucket_size_aligned * ja->nr,
 		};
 
 	buckets = bch2_journal_dev_buckets_available(j, ja, from);
-	sectors = ja->sectors_free;
+	sectors = round_down(ja->sectors_free, block_sectors(c));
 
 	/*
 	 * We that we don't allocate the space for a journal entry
@@ -107,7 +111,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
 			continue;
 
 		/* entry won't fit on this device, skip: */
-		if (unwritten > ca->mi.bucket_size)
+		if (unwritten > bucket_size_aligned)
 			continue;
 
 		if (unwritten >= sectors) {
@@ -117,7 +121,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
 			}
 
 			buckets--;
-			sectors = ca->mi.bucket_size;
+			sectors = bucket_size_aligned;
 		}
 
 		sectors -= unwritten;
@@ -125,12 +129,12 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
 
 	if (sectors < ca->mi.bucket_size && buckets) {
 		buckets--;
-		sectors = ca->mi.bucket_size;
+		sectors = bucket_size_aligned;
 	}
 
 	return (struct journal_space) {
 		.next_entry	= sectors,
-		.total		= sectors + buckets * ca->mi.bucket_size,
+		.total		= sectors + buckets * bucket_size_aligned,
 	};
 }
 
@@ -144,7 +148,6 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
 
 	BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
 
-	rcu_read_lock();
 	for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
 		if (!ca->journal.nr ||
 		    !ca->mi.durability)
@@ -162,12 +165,17 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
 
 		array_insert_item(dev_space, nr_devs, pos, space);
 	}
-	rcu_read_unlock();
 
 	if (nr_devs < nr_devs_want)
 		return (struct journal_space) { 0, 0 };
 
 	/*
+	 * It's possible for bucket size to be misaligned w.r.t. the filesystem
+	 * block size:
+	 */
+	min_bucket_size = round_down(min_bucket_size, block_sectors(c));
+
+	/*
 	 * We sorted largest to smallest, and we want the smallest out of the
 	 * @nr_devs_want largest devices:
 	 */
@@ -187,8 +195,8 @@ void bch2_journal_space_available(struct journal *j)
 	int ret = 0;
 
 	lockdep_assert_held(&j->lock);
+	guard(rcu)();
 
-	rcu_read_lock();
 	for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
 		struct journal_device *ja = &ca->journal;
 
@@ -203,30 +211,28 @@ void bch2_journal_space_available(struct journal *j)
 		       ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
 			ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
 
-		if (ja->discard_idx != ja->dirty_idx_ondisk)
-			can_discard = true;
+		can_discard |= __should_discard_bucket(j, ja);
 
 		max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
 		nr_online++;
 	}
-	rcu_read_unlock();
 
 	j->can_discard = can_discard;
 
 	if (nr_online < metadata_replicas_required(c)) {
-		struct printbuf buf = PRINTBUF;
-		buf.atomic++;
-		prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
-			   "rw journal devs:", nr_online, metadata_replicas_required(c));
-
-		rcu_read_lock();
-		for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
-			prt_printf(&buf, " %s", ca->name);
-		rcu_read_unlock();
-
-		bch_err(c, "%s", buf.buf);
-		printbuf_exit(&buf);
-		ret = JOURNAL_ERR_insufficient_devices;
+		if (!(c->sb.features & BIT_ULL(BCH_FEATURE_small_image))) {
+			struct printbuf buf = PRINTBUF;
+			buf.atomic++;
+			prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
+				   "rw journal devs:", nr_online, metadata_replicas_required(c));
+
+			for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
+				prt_printf(&buf, " %s", ca->name);
+
+			bch_err(c, "%s", buf.buf);
+			printbuf_exit(&buf);
+		}
+		ret = bch_err_throw(c, insufficient_journal_devices);
 		goto out;
 	}
 
@@ -240,7 +246,7 @@ void bch2_journal_space_available(struct journal *j)
 	total		= j->space[journal_space_total].total;
 
 	if (!j->space[journal_space_discarded].next_entry)
-		ret = JOURNAL_ERR_journal_full;
+		ret = bch_err_throw(c, journal_full);
 
 	if ((j->space[journal_space_clean_ondisk].next_entry <
 	     j->space[journal_space_clean_ondisk].total) &&
@@ -252,7 +258,9 @@ void bch2_journal_space_available(struct journal *j)
 
 	bch2_journal_set_watermark(j);
 out:
-	j->cur_entry_sectors	= !ret ? j->space[journal_space_discarded].next_entry : 0;
+	j->cur_entry_sectors	= !ret
+		? j->space[journal_space_discarded].next_entry
+		: 0;
 	j->cur_entry_error	= ret;
 
 	if (!ret)
@@ -261,12 +269,19 @@ out:
 
 /* Discards - last part of journal reclaim: */
 
-static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+static bool __should_discard_bucket(struct journal *j, struct journal_device *ja)
 {
-	bool ret;
+	unsigned min_free = max(4, ja->nr / 8);
 
+	return bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) <
+		min_free &&
+		ja->discard_idx != ja->dirty_idx_ondisk;
+}
+
+static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+{
 	spin_lock(&j->lock);
-	ret = ja->discard_idx != ja->dirty_idx_ondisk;
+	bool ret = __should_discard_bucket(j, ja);
 	spin_unlock(&j->lock);
 
 	return ret;
@@ -282,12 +297,12 @@ void bch2_journal_do_discards(struct journal *j)
 
 	mutex_lock(&j->discard_lock);
 
-	for_each_rw_member(c, ca) {
+	for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_do_discards) {
 		struct journal_device *ja = &ca->journal;
 
 		while (should_discard_bucket(j, ja)) {
 			if (!c->opts.nochanges &&
-			    ca->mi.discard &&
+			    bch2_discard_opt_enabled(c, ca) &&
 			    bdev_max_discard_sectors(ca->disk_sb.bdev))
 				blkdev_issue_discard(ca->disk_sb.bdev,
 					bucket_to_sector(ca,
@@ -384,12 +399,16 @@ void bch2_journal_pin_drop(struct journal *j,
 	spin_unlock(&j->lock);
 }
 
-static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
+static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin,
+					      journal_pin_flush_fn fn)
 {
 	if (fn == bch2_btree_node_flush0 ||
-	    fn == bch2_btree_node_flush1)
-		return JOURNAL_PIN_TYPE_btree;
-	else if (fn == bch2_btree_key_cache_journal_flush)
+	    fn == bch2_btree_node_flush1) {
+		unsigned idx = fn == bch2_btree_node_flush1;
+		struct btree *b = container_of(pin, struct btree, writes[idx].journal);
+
+		return JOURNAL_PIN_TYPE_btree0 - b->c.level;
+	} else if (fn == bch2_btree_key_cache_journal_flush)
 		return JOURNAL_PIN_TYPE_key_cache;
 	else
 		return JOURNAL_PIN_TYPE_other;
@@ -441,7 +460,7 @@ void bch2_journal_pin_copy(struct journal *j,
 
 	bool reclaim = __journal_pin_drop(j, dst);
 
-	bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
+	bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn));
 
 	if (reclaim)
 		bch2_journal_reclaim_fast(j);
@@ -465,7 +484,7 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
 
 	bool reclaim = __journal_pin_drop(j, pin);
 
-	bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
+	bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn));
 
 	if (reclaim)
 		bch2_journal_reclaim_fast(j);
@@ -587,7 +606,7 @@ static size_t journal_flush_pins(struct journal *j,
 		spin_lock(&j->lock);
 		/* Pin might have been dropped or rearmed: */
 		if (likely(!err && !j->flush_in_progress_dropped))
-			list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]);
+			list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]);
 		j->flush_in_progress = NULL;
 		j->flush_in_progress_dropped = false;
 		spin_unlock(&j->lock);
@@ -608,9 +627,10 @@ static u64 journal_seq_to_flush(struct journal *j)
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	u64 seq_to_flush = 0;
 
-	spin_lock(&j->lock);
+	guard(spinlock)(&j->lock);
+	guard(rcu)();
 
-	for_each_rw_member(c, ca) {
+	for_each_rw_member_rcu(c, ca) {
 		struct journal_device *ja = &ca->journal;
 		unsigned nr_buckets, bucket_to_flush;
 
@@ -620,20 +640,15 @@ static u64 journal_seq_to_flush(struct journal *j)
 		/* Try to keep the journal at most half full: */
 		nr_buckets = ja->nr / 2;
 
-		nr_buckets = min(nr_buckets, ja->nr);
-
 		bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
 		seq_to_flush = max(seq_to_flush,
 				   ja->bucket_seq[bucket_to_flush]);
 	}
 
 	/* Also flush if the pin fifo is more than half full */
-	seq_to_flush = max_t(s64, seq_to_flush,
-			     (s64) journal_cur_seq(j) -
-			     (j->pin.size >> 1));
-	spin_unlock(&j->lock);
-
-	return seq_to_flush;
+	return max_t(s64, seq_to_flush,
+		     (s64) journal_cur_seq(j) -
+		     (j->pin.size >> 1));
 }
 
 /**
@@ -641,7 +656,6 @@ static u64 journal_seq_to_flush(struct journal *j)
  * @j:		journal object
  * @direct:	direct or background reclaim?
  * @kicked:	requested to run since we last ran?
- * Returns:	0 on success, or -EIO if the journal has been shutdown
  *
  * Background journal reclaim writes out btree nodes. It should be run
  * early enough so that we never completely run out of journal buckets.
@@ -681,11 +695,11 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
 		if (kthread && kthread_should_stop())
 			break;
 
-		if (bch2_journal_error(j)) {
-			ret = -EIO;
+		ret = bch2_journal_error(j);
+		if (ret)
 			break;
-		}
 
+		/* XXX shove journal discards off to another thread */
 		bch2_journal_do_discards(j);
 
 		seq_to_flush = journal_seq_to_flush(j);
@@ -869,18 +883,13 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
 
 	mutex_lock(&j->reclaim_lock);
 
-	if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
-			       BIT(JOURNAL_PIN_TYPE_key_cache)|
-			       BIT(JOURNAL_PIN_TYPE_other))) {
-		*did_work = true;
-		goto unlock;
-	}
-
-	if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
-			       BIT(JOURNAL_PIN_TYPE_btree))) {
-		*did_work = true;
-		goto unlock;
-	}
+	for (int type = JOURNAL_PIN_TYPE_NR - 1;
+	     type >= 0;
+	     --type)
+		if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) {
+			*did_work = true;
+			goto unlock;
+		}
 
 	if (seq_to_flush > journal_cur_seq(j))
 		bch2_journal_entry_close(j);
@@ -953,7 +962,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
 	seq = 0;
 	spin_lock(&j->lock);
 	while (!ret) {
-		struct bch_replicas_padded replicas;
+		union bch_replicas_padded replicas;
 
 		seq = max(seq, journal_last_seq(j));
 		if (seq >= j->pin.back)