From d5cc2e3f968ff60f247fdef15b04fac788ef46d2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 16 Apr 2015 21:59:07 +1000 Subject: xfs: DIO needs an ioend for writes Currently we can only tell DIO completion that an IO requires unwritten extent completion. This is done by a hacky non-null private pointer passed to Io completion, but the private pointer does not actually contain any information that is used. We also need to pass to IO completion the fact that the IO may be beyond EOF and so a size update transaction needs to be done. This is currently determined by checks in the io completion, but we need to determine if this is necessary at block mapping time as we need to defer the size update transactions to a completion workqueue, just like unwritten extent conversion. To do this, first we need to allocate and pass an ioend to to IO completion. Add this for unwritten extent conversion; we'll do the EOF updates in the next commit. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_trace.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/xfs/xfs_trace.h') diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 51372e34d988..2de8556ffac2 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1217,6 +1217,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found); DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_found); DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), -- cgit v1.2.3 From 6dfa1b67e3b3a9bf536e2fb9ed99001c219822a5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 16 Apr 2015 21:59:34 +1000 Subject: xfs: handle DIO overwrite EOF update completion correctly Currently a DIO overwrite that extends the EOF (e.g sub-block IO or write into allocated blocks beyond EOF) requires a transaction for the EOF update. Thi is done in IO completion context, but we aren't explicitly handling this situation properly and so it can run in interrupt context. Ensure that we defer IO that spans EOF correctly to the DIO completion workqueue, and now that we have an ioend in IO completion we can use the common ioend completion path to do all the work. Note: we do not preallocate the append transaction as we can have multiple mapping and allocation calls per direct IO. hence preallocating can still leave us with nested transactions by attempting to map and allocate more blocks after we've preallocated an append transaction. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 61 +++++++++++++++++++++++++++--------------------------- fs/xfs/xfs_trace.h | 1 + 2 files changed, 31 insertions(+), 31 deletions(-) (limited to 'fs/xfs/xfs_trace.h') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 60d6466d72f6..a59443db1de9 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1293,7 +1293,7 @@ xfs_map_direct( imap); } - if (ioend->io_type == XFS_IO_UNWRITTEN) + if (ioend->io_type == XFS_IO_UNWRITTEN || xfs_ioend_is_append(ioend)) set_buffer_defer_completion(bh_result); } @@ -1535,8 +1535,10 @@ xfs_end_io_direct_write( struct xfs_mount *mp = ip->i_mount; struct xfs_ioend *ioend = private; + trace_xfs_gbmap_direct_endio(ip, offset, size, ioend->io_type, NULL); + if (XFS_FORCED_SHUTDOWN(mp)) - goto out_destroy_ioend; + goto out_end_io; /* * dio completion end_io functions are only called on writes if more @@ -1557,40 +1559,37 @@ xfs_end_io_direct_write( ioend->io_offset = offset; /* - * While the generic direct I/O code updates the inode size, it does - * so only after the end_io handler is called, which means our - * end_io handler thinks the on-disk size is outside the in-core - * size. To prevent this just update it a little bit earlier here. + * The ioend tells us whether we are doing unwritten extent conversion + * or an append transaction that updates the on-disk file size. These + * cases are the only cases where we should *potentially* be needing + * to update the VFS inode size. When the ioend indicates this, we + * are *guaranteed* to be running in non-interrupt context. + * + * We need to update the in-core inode size here so that we don't end up + * with the on-disk inode size being outside the in-core inode size. + * While we can do this in the process context after the IO has + * completed, this does not work for AIO and hence we always update + * the in-core inode size here if necessary. */ - if (offset + size > i_size_read(inode)) - i_size_write(inode, offset + size); + if (ioend->io_type == XFS_IO_UNWRITTEN || xfs_ioend_is_append(ioend)) { + if (offset + size > i_size_read(inode)) + i_size_write(inode, offset + size); + } else + ASSERT(offset + size <= i_size_read(inode)); /* - * For direct I/O we do not know if we need to allocate blocks or not, - * so we can't preallocate an append transaction, as that results in - * nested reservations and log space deadlocks. Hence allocate the - * transaction here. While this is sub-optimal and can block IO - * completion for some time, we're stuck with doing it this way until - * we can pass the ioend to the direct IO allocation callbacks and - * avoid nesting that way. + * If we are doing an append IO that needs to update the EOF on disk, + * do the transaction reserve now so we can use common end io + * processing. Stashing the error (if there is one) in the ioend will + * result in the ioend processing passing on the error if it is + * possible as we can't return it from here. */ - if (ioend->io_type == XFS_IO_UNWRITTEN) { - xfs_iomap_write_unwritten(ip, offset, size); - } else if (offset + size > ip->i_d.di_size) { - struct xfs_trans *tp; - int error; - - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - goto out_destroy_ioend; - } + if (ioend->io_type == XFS_IO_OVERWRITE && xfs_ioend_is_append(ioend)) + ioend->io_error = xfs_setfilesize_trans_alloc(ioend); - xfs_setfilesize(ip, tp, offset, size); - } -out_destroy_ioend: - xfs_destroy_ioend(ioend); +out_end_io: + xfs_end_io(&ioend->io_work); + return; } STATIC ssize_t diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 2de8556ffac2..0ae50e9847bb 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1220,6 +1220,7 @@ DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_gbmap_direct); DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new); DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), -- cgit v1.2.3 From a06c277a13c3620c8ee9304891758f2fcff9c4a4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 16 Apr 2015 22:00:00 +1000 Subject: xfs: DIO writes within EOF don't need an ioend DIO writes that lie entirely within EOF have nothing to do in IO completion. In this case, we don't need no steekin' ioend, and so we can avoid allocating an ioend until we have a mapping that spans EOF. This means that IO completion has two contexts - deferred completion to the dio workqueue that uses an ioend, and interrupt completion that does nothing because there is nothing that can be done in this context. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 69 ++++++++++++++++++++++++++++++------------------------ fs/xfs/xfs_trace.h | 1 + 2 files changed, 40 insertions(+), 30 deletions(-) (limited to 'fs/xfs/xfs_trace.h') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a59443db1de9..c02a47453137 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1234,15 +1234,19 @@ xfs_vm_releasepage( } /* - * When we map a DIO buffer, we need to attach an ioend that describes the type - * of write IO we are doing. This passes to the completion function the - * operations it needs to perform. + * When we map a DIO buffer, we may need to attach an ioend that describes the + * type of write IO we are doing. This passes to the completion function the + * operations it needs to perform. If the mapping is for an overwrite wholly + * within the EOF then we don't need an ioend and so we don't allocate one. + * This avoids the unnecessary overhead of allocating and freeing ioends for + * workloads that don't require transactions on IO completion. * * If we get multiple mappings in a single IO, we might be mapping different * types. But because the direct IO can only have a single private pointer, we * need to ensure that: * - * a) the ioend spans the entire region of the IO; and + * a) i) the ioend spans the entire region of unwritten mappings; or + * ii) the ioend spans all the mappings that cross or are beyond EOF; and * b) if it contains unwritten extents, it is *permanently* marked as such * * We could do this by chaining ioends like buffered IO does, but we only @@ -1283,21 +1287,23 @@ xfs_map_direct( trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset, ioend->io_size, ioend->io_type, imap); - } else { + } else if (type == XFS_IO_UNWRITTEN || + offset + size > i_size_read(inode)) { ioend = xfs_alloc_ioend(inode, type); ioend->io_offset = offset; ioend->io_size = size; + bh_result->b_private = ioend; + set_buffer_defer_completion(bh_result); trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type, imap); + } else { + trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type, + imap); } - - if (ioend->io_type == XFS_IO_UNWRITTEN || xfs_ioend_is_append(ioend)) - set_buffer_defer_completion(bh_result); } - /* * If this is O_DIRECT or the mpage code calling tell them how large the mapping * is, so that we can avoid repeated get_blocks calls. @@ -1519,9 +1525,11 @@ xfs_get_blocks_direct( /* * Complete a direct I/O write request. * - * If the private argument is non-NULL __xfs_get_blocks signals us that we - * need to issue a transaction to convert the range from unwritten to written - * extents. + * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. + * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite + * wholly within the EOF and so there is nothing for us to do. Note that in this + * case the completion can be called in interrupt context, whereas if we have an + * ioend we will always be called in task context (i.e. from a workqueue). */ STATIC void xfs_end_io_direct_write( @@ -1535,7 +1543,13 @@ xfs_end_io_direct_write( struct xfs_mount *mp = ip->i_mount; struct xfs_ioend *ioend = private; - trace_xfs_gbmap_direct_endio(ip, offset, size, ioend->io_type, NULL); + trace_xfs_gbmap_direct_endio(ip, offset, size, + ioend ? ioend->io_type : 0, NULL); + + if (!ioend) { + ASSERT(offset + size <= i_size_read(inode)); + return; + } if (XFS_FORCED_SHUTDOWN(mp)) goto out_end_io; @@ -1548,12 +1562,12 @@ xfs_end_io_direct_write( /* * The ioend only maps whole blocks, while the IO may be sector aligned. - * Hence the ioend offset/size may not match the IO offset/size exactly, - * but should span it completely. Write the IO sizes into the ioend so - * that completion processing does the right thing. + * Hence the ioend offset/size may not match the IO offset/size exactly. + * Because we don't map overwrites within EOF into the ioend, the offset + * may not match, but only if the endio spans EOF. Either way, write + * the IO sizes into the ioend so that completion processing does the + * right thing. */ - ASSERT(size <= ioend->io_size); - ASSERT(offset >= ioend->io_offset); ASSERT(offset + size <= ioend->io_offset + ioend->io_size); ioend->io_size = size; ioend->io_offset = offset; @@ -1562,20 +1576,15 @@ xfs_end_io_direct_write( * The ioend tells us whether we are doing unwritten extent conversion * or an append transaction that updates the on-disk file size. These * cases are the only cases where we should *potentially* be needing - * to update the VFS inode size. When the ioend indicates this, we - * are *guaranteed* to be running in non-interrupt context. + * to update the VFS inode size. * * We need to update the in-core inode size here so that we don't end up - * with the on-disk inode size being outside the in-core inode size. - * While we can do this in the process context after the IO has - * completed, this does not work for AIO and hence we always update - * the in-core inode size here if necessary. + * with the on-disk inode size being outside the in-core inode size. We + * have no other method of updating EOF for AIO, so always do it here + * if necessary. */ - if (ioend->io_type == XFS_IO_UNWRITTEN || xfs_ioend_is_append(ioend)) { - if (offset + size > i_size_read(inode)) - i_size_write(inode, offset + size); - } else - ASSERT(offset + size <= i_size_read(inode)); + if (offset + size > i_size_read(inode)) + i_size_write(inode, offset + size); /* * If we are doing an append IO that needs to update the EOF on disk, @@ -1584,7 +1593,7 @@ xfs_end_io_direct_write( * result in the ioend processing passing on the error if it is * possible as we can't return it from here. */ - if (ioend->io_type == XFS_IO_OVERWRITE && xfs_ioend_is_append(ioend)) + if (ioend->io_type == XFS_IO_OVERWRITE) ioend->io_error = xfs_setfilesize_trans_alloc(ioend); out_end_io: diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 0ae50e9847bb..4e0a5773eee4 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1220,6 +1220,7 @@ DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_gbmap_direct); DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new); DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none); DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio); DECLARE_EVENT_CLASS(xfs_simple_io_class, -- cgit v1.2.3