summaryrefslogtreecommitdiff
path: root/fs/xfs/xfs_log_cil.c
diff options
context:
space:
mode:
author: Dave Chinner <dchinner@redhat.com> 2022-03-30 04:22:02 +0300
committer: Darrick J. Wong <djwong@kernel.org> 2022-03-30 04:22:02 +0300
commit: 919edbadebe17a67193533f531c2920c03e40fa4 (patch)
tree: 199346ef6111b6685f9f242d4173f807bc9593b2 /fs/xfs/xfs_log_cil.c
parent: 5652ef31705f240e1528fe5a45d99229752e1ec8 (diff)
download: linux-919edbadebe17a67193533f531c2920c03e40fa4.tar.xz
xfs: drop async cache flushes from CIL commits.
Jan Kara reported a performance regression in dbench that he
bisected down to commit bad77c375e8d ("xfs: CIL checkpoint
flushes caches unconditionally").

Whilst developing the journal flush/fua optimisations this cache flush
was part of, it appeared to make a significant difference to
performance. However, now that this patchset has settled and all the
correctness issues have been fixed, there does not appear to be any
significant performance benefit to asynchronous cache flushes.

In fact, the opposite is true on some storage types and workloads,
where additional cache flushes that can occur from fsync heavy
workloads have measurable and significant impact on overall
throughput.

Local dbench testing shows little difference on dbench runs with
sync vs async cache flushes on either fast or slow SSD storage, and
no difference in streaming concurrent async transaction workloads
like fs-mark.

Fast NVME storage.

From `dbench -t 30`, CIL scale:

clients         async                   sync
                BW       Latency        BW       Latency
1                935.18    0.855         915.64    0.903
8               2404.51    6.873        2341.77    6.511
16              3003.42    6.460        2931.57    6.529
32              3697.23    7.939        3596.28    7.894
128             7237.43   15.495        7217.74   11.588
512             5079.24   90.587        5167.08   95.822

fsmark, 32 threads, create w/ 64 byte xattr w/32k logbsize

        create          chown           unlink
async   1m41s           1m16s           2m03s
sync    1m40s           1m19s           1m54s

Slower SATA SSD storage:

From `dbench -t 30`, CIL scale:

clients         async                   sync
                BW       Latency        BW       Latency
1                 78.59   15.792          83.78   10.729
8                367.88   92.067         404.63   59.943
16               564.51   72.524         602.71   76.089
32               831.66  105.984         870.26  110.482
128             1659.76  102.969        1624.73   91.356
512             2135.91  223.054        2603.07  161.160

fsmark, 16 threads, create w/32k logbsize

        create          unlink
async   5m06s           4m15s
sync    5m00s           4m22s

And on Jan's test machine:

                   5.18-rc8-vanilla       5.18-rc8-patched
Amean     1        71.22 (   0.00%)       64.94 *   8.81%*
Amean     2        93.03 (   0.00%)       84.80 *   8.85%*
Amean     4       150.54 (   0.00%)      137.51 *   8.66%*
Amean     8       252.53 (   0.00%)      242.24 *   4.08%*
Amean     16      454.13 (   0.00%)      439.08 *   3.31%*
Amean     32      835.24 (   0.00%)      829.74 *   0.66%*
Amean     64     1740.59 (   0.00%)     1686.73 *   3.09%*

Performance and cache flush behaviour is restored to pre-regression
levels.

As such, we can now consider the async cache flush mechanism an
unnecessary exercise in premature optimisation and hence we can
now remove it and the infrastructure it requires completely.

Fixes: bad77c375e8d ("xfs: CIL checkpoint flushes caches unconditionally")
Reported-and-tested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Diffstat (limited to 'fs/xfs/xfs_log_cil.c')
-rw-r--r-- fs/xfs/xfs_log_cil.c | 42
1 files changed, 13 insertions, 29 deletions
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 767c386ed4ce..ba57323bfdce 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -705,11 +705,21 @@ xlog_cil_set_ctx_write_state(
* The LSN we need to pass to the log items on transaction
* commit is the LSN reported by the first log vector write, not
* the commit lsn. If we use the commit record lsn then we can
- * move the tail beyond the grant write head.
+ * move the grant write head beyond the tail LSN and overwrite
+ * it.
*/
ctx->start_lsn = lsn;
wake_up_all(&cil->xc_start_wait);
spin_unlock(&cil->xc_push_lock);
+
+ /*
+ * Make sure the metadata we are about to overwrite in the log
+ * has been flushed to stable storage before this iclog is
+ * issued.
+ */
+ spin_lock(&cil->xc_log->l_icloglock);
+ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
+ spin_unlock(&cil->xc_log->l_icloglock);
return;
}
@@ -888,10 +898,7 @@ xlog_cil_push_work(
struct xfs_trans_header thdr;
struct xfs_log_iovec lhdr;
struct xfs_log_vec lvhdr = { NULL };
- xfs_lsn_t preflush_tail_lsn;
xfs_csn_t push_seq;
- struct bio bio;
- DECLARE_COMPLETION_ONSTACK(bdev_flush);
bool push_commit_stable;
new_ctx = xlog_cil_ctx_alloc();
@@ -962,23 +969,6 @@ xlog_cil_push_work(
spin_unlock(&cil->xc_push_lock);
/*
- * The CIL is stable at this point - nothing new will be added to it
- * because we hold the flush lock exclusively. Hence we can now issue
- * a cache flush to ensure all the completed metadata in the journal we
- * are about to overwrite is on stable storage.
- *
- * Because we are issuing this cache flush before we've written the
- * tail lsn to the iclog, we can have metadata IO completions move the
- * tail forwards between the completion of this flush and the iclog
- * being written. In this case, we need to re-issue the cache flush
- * before the iclog write. To detect whether the log tail moves, sample
- * the tail LSN *before* we issue the flush.
- */
- preflush_tail_lsn = atomic64_read(&log->l_tail_lsn);
- xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev,
- &bdev_flush);
-
- /*
* Pull all the log vectors off the items in the CIL, and remove the
* items from the CIL. We don't need the CIL lock here because it's only
* needed on the transaction commit side which is currently locked out
@@ -1054,12 +1044,6 @@ xlog_cil_push_work(
lvhdr.lv_iovecp = &lhdr;
lvhdr.lv_next = ctx->lv_chain;
- /*
- * Before we format and submit the first iclog, we have to ensure that
- * the metadata writeback ordering cache flush is complete.
- */
- wait_for_completion(&bdev_flush);
-
error = xlog_cil_write_chain(ctx, &lvhdr);
if (error)
goto out_abort_free_ticket;
@@ -1118,7 +1102,7 @@ xlog_cil_push_work(
if (push_commit_stable &&
ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE)
xlog_state_switch_iclogs(log, ctx->commit_iclog, 0);
- xlog_state_release_iclog(log, ctx->commit_iclog, preflush_tail_lsn);
+ xlog_state_release_iclog(log, ctx->commit_iclog);
/* Not safe to reference ctx now! */
@@ -1139,7 +1123,7 @@ out_abort_free_ticket:
return;
}
spin_lock(&log->l_icloglock);
- xlog_state_release_iclog(log, ctx->commit_iclog, 0);
+ xlog_state_release_iclog(log, ctx->commit_iclog);
/* Not safe to reference ctx now! */
spin_unlock(&log->l_icloglock);
}