author		Linus Torvalds <torvalds@linux-foundation.org>	2016-07-27 19:53:35 +0300
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-07-27 19:53:35 +0300
commit		0e6acf0204da5b8705722a5f6806a4f55ed379d6 (patch)
tree		4a8a9bf9daba9c734a0fdde417ae1cb472ca396d /fs/xfs/xfs_file.c
parent		0e06f5c0deeef0332a5da2ecb8f1fcf3e024d958 (diff)
parent		f2bdfda9a1c668539bc85baf5625f6f14bc510b1 (diff)
download	linux-0e6acf0204da5b8705722a5f6806a4f55ed379d6.tar.xz
Merge tag 'xfs-for-linus-4.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs
Pull xfs updates from Dave Chinner:
"The major addition is the new iomap based block mapping
infrastructure. We've been kicking this about locally for years, but
there are other filesystems want to use it too (e.g. gfs2). Now it
is fully working, reviewed and ready for merge and be used by other
filesystems.
There are a lot of other fixes and cleanups in the tree, but those are
XFS internal things and none are of the scale or visibility of the
iomap changes. See below for details.
I am likely to send another pull request next week - we're just about
ready to merge some new functionality (on disk block->owner reverse
mapping infrastructure), but that's a huge chunk of code (74 files
changed, 7283 insertions(+), 1114 deletions(-)) so I'm keeping that
separate from all the "normal" pull request changes so they don't get
lost in the noise.
Summary of changes in this update:
- generic iomap based IO path infrastructure
- generic iomap based fiemap implementation
- xfs iomap based IO path implementation
- buffer error handling fixes
- tracking of in-flight buffer IO for unmount serialisation
- direct IO and DAX io path separation and simplification
- shortform directory format definition changes for wider platform
compatibility
- various buffer cache fixes
- cleanups in preparation for rmap merge
- error injection cleanups and fixes
- log item format buffer memory allocation restructuring to prevent
rare OOM reclaim deadlocks
- sparse inode chunks are now fully supported"
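
To make the shape of the new iomap infrastructure concrete, here is a minimal
sketch (not XFS code; the myfs_* names are invented for illustration) of how a
filesystem hands range-zeroing to the generic core, modelled on the one-line
xfs_zero_range() wrapper visible in the diff below. The hole-only iomap_begin
is a deliberate placeholder: a real implementation, such as xfs_iomap_ops in
this series, performs an actual block-mapping lookup here.

#include <linux/fs.h>
#include <linux/iomap.h>

/*
 * Illustrative iomap_begin: report every request as a hole, so
 * iomap_zero_range() below has nothing to write.  A real filesystem
 * looks up (and, for writes, allocates) the block mapping here.
 */
static int
myfs_iomap_begin(
	struct inode		*inode,
	loff_t			pos,
	loff_t			length,
	unsigned		flags,
	struct iomap		*iomap)
{
	iomap->blkno = IOMAP_NULL_BLOCK;	/* no backing blocks */
	iomap->type = IOMAP_HOLE;
	iomap->offset = pos;
	iomap->length = length;
	iomap->bdev = inode->i_sb->s_bdev;
	return 0;
}

static struct iomap_ops myfs_iomap_ops = {
	.iomap_begin		= myfs_iomap_begin,
};

/*
 * Zero a byte range: the iomap core walks the mappings and skips
 * holes and unwritten extents, which are already zero on disk.
 */
static int
myfs_zero_range(
	struct inode		*inode,
	loff_t			pos,
	loff_t			len,
	bool			*did_zero)
{
	return iomap_zero_range(inode, pos, len, did_zero, &myfs_iomap_ops);
}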
* tag 'xfs-for-linus-4.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs: (53 commits)
xfs: remove EXPERIMENTAL tag from sparse inode feature
xfs: bufferhead chains are invalid after end_page_writeback
xfs: allocate log vector buffers outside CIL context lock
libxfs: directory node splitting does not have an extra block
xfs: remove dax code from object file when disabled
xfs: skip dirty pages in ->releasepage()
xfs: remove __arch_pack
xfs: kill xfs_dir2_inou_t
xfs: kill xfs_dir2_sf_off_t
xfs: split direct I/O and DAX path
xfs: direct calls in the direct I/O path
xfs: stop using generic_file_read_iter for direct I/O
xfs: split xfs_file_read_iter into buffered and direct I/O helpers
xfs: remove s_maxbytes enforcement in xfs_file_read_iter
xfs: kill ioflags
xfs: don't pass ioflags around in the ioctl path
xfs: track and serialize in-flight async buffers against unmount
xfs: exclude never-released buffers from buftarg I/O accounting
xfs: don't reset b_retries to 0 on every failure
xfs: remove extraneous buffer flag changes
...
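
The generic fiemap bullet in the summary above is similarly thin on the
filesystem side: the ->fiemap method reduces to a single call over the same
ops table. A hedged sketch, reusing the illustrative myfs_iomap_ops from the
earlier example (XFS wires xfs_iomap_ops into its own xfs_vn_fiemap elsewhere
in this series):

#include <linux/fiemap.h>

/* ->fiemap: let the iomap core walk the mapping and emit extents. */
static int
myfs_fiemap(
	struct inode		*inode,
	struct fiemap_extent_info *fieinfo,
	u64			start,
	u64			len)
{
	return iomap_fiemap(inode, fieinfo, start, len, &myfs_iomap_ops);
}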
Diffstat (limited to 'fs/xfs/xfs_file.c')
-rw-r--r--	fs/xfs/xfs_file.c	425
1 file changed, 188 insertions, 237 deletions
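
One change worth noting before the diff: buffered writes are switched from
generic_perform_write() to iomap_file_buffered_write(), which does not advance
the iocb position itself (the XFS code below does iocb->ki_pos += ret after
the call). Here is a minimal, hedged sketch of a ->write_iter built on it; the
myfs_* names are assumptions, locking and writeback throttling are elided, and
the ops table must supply real (allocated) mappings for IOMAP_WRITE requests,
unlike the hole-only example earlier.

/* Minimal buffered ->write_iter over the new iomap write path. */
static ssize_t
myfs_file_write_iter(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	ssize_t			ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0) {
		ret = iomap_file_buffered_write(iocb, from, &myfs_iomap_ops);
		if (ret > 0)
			iocb->ki_pos += ret;	/* core does not advance it */
	}
	inode_unlock(inode);
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}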
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1b3dc9dd8861..ed95e5bb04e6 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -37,6 +37,7 @@
 #include "xfs_log.h"
 #include "xfs_icache.h"
 #include "xfs_pnfs.h"
+#include "xfs_iomap.h"
 
 #include <linux/dcache.h>
 #include <linux/falloc.h>
@@ -80,61 +81,17 @@ xfs_rw_ilock_demote(
 }
 
 /*
- * xfs_iozero clears the specified range supplied via the page cache (except in
- * the DAX case). Writes through the page cache will allocate blocks over holes,
- * though the callers usually map the holes first and avoid them. If a block is
- * not completely zeroed, then it will be read from disk before being partially
- * zeroed.
- *
- * In the DAX case, we can just directly write to the underlying pages. This
- * will not allocate blocks, but will avoid holes and unwritten extents and so
- * not do unnecessary work.
+ * Clear the specified ranges to zero through either the pagecache or DAX.
+ * Holes and unwritten extents will be left as-is as they already are zeroed.
  */
 int
-xfs_iozero(
-	struct xfs_inode	*ip,	/* inode */
-	loff_t			pos,	/* offset in file */
-	size_t			count)	/* size of data to zero */
+xfs_zero_range(
+	struct xfs_inode	*ip,
+	xfs_off_t		pos,
+	xfs_off_t		count,
+	bool			*did_zero)
 {
-	struct page		*page;
-	struct address_space	*mapping;
-	int			status = 0;
-
-
-	mapping = VFS_I(ip)->i_mapping;
-	do {
-		unsigned offset, bytes;
-		void *fsdata;
-
-		offset = (pos & (PAGE_SIZE -1)); /* Within page */
-		bytes = PAGE_SIZE - offset;
-		if (bytes > count)
-			bytes = count;
-
-		if (IS_DAX(VFS_I(ip))) {
-			status = dax_zero_page_range(VFS_I(ip), pos, bytes,
-						     xfs_get_blocks_direct);
-			if (status)
-				break;
-		} else {
-			status = pagecache_write_begin(NULL, mapping, pos, bytes,
-						AOP_FLAG_UNINTERRUPTIBLE,
-						&page, &fsdata);
-			if (status)
-				break;
-
-			zero_user(page, offset, bytes);
-
-			status = pagecache_write_end(NULL, mapping, pos, bytes,
-						bytes, page, fsdata);
-			WARN_ON(status <= 0); /* can't return less than zero! */
-			status = 0;
-		}
-		pos += bytes;
-		count -= bytes;
-	} while (count);
-
-	return status;
+	return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops);
 }
 
 int
@@ -282,48 +239,35 @@ xfs_file_fsync(
 }
 
 STATIC ssize_t
-xfs_file_read_iter(
+xfs_file_dio_aio_read(
 	struct kiocb		*iocb,
 	struct iov_iter		*to)
 {
-	struct file		*file = iocb->ki_filp;
-	struct inode		*inode = file->f_mapping->host;
+	struct address_space	*mapping = iocb->ki_filp->f_mapping;
+	struct inode		*inode = mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	size_t			size = iov_iter_count(to);
+	loff_t			isize = i_size_read(inode);
+	size_t			count = iov_iter_count(to);
+	struct iov_iter		data;
+	struct xfs_buftarg	*target;
 	ssize_t			ret = 0;
-	int			ioflags = 0;
-	xfs_fsize_t		n;
-	loff_t			pos = iocb->ki_pos;
 
-	XFS_STATS_INC(mp, xs_read_calls);
-
-	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
-		ioflags |= XFS_IO_ISDIRECT;
-	if (file->f_mode & FMODE_NOCMTIME)
-		ioflags |= XFS_IO_INVIS;
-
-	if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
-		xfs_buftarg_t	*target =
-			XFS_IS_REALTIME_INODE(ip) ?
-				mp->m_rtdev_targp : mp->m_ddev_targp;
-		/* DIO must be aligned to device logical sector size */
-		if ((pos | size) & target->bt_logical_sectormask) {
-			if (pos == i_size_read(inode))
-				return 0;
-			return -EINVAL;
-		}
-	}
+	trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
 
-	n = mp->m_super->s_maxbytes - pos;
-	if (n <= 0 || size == 0)
-		return 0;
+	if (!count)
+		return 0; /* skip atime */
 
-	if (n < size)
-		size = n;
+	if (XFS_IS_REALTIME_INODE(ip))
+		target = ip->i_mount->m_rtdev_targp;
+	else
+		target = ip->i_mount->m_ddev_targp;
 
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return -EIO;
+	/* DIO must be aligned to device logical sector size */
+	if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
+		if (iocb->ki_pos == isize)
+			return 0;
+		return -EINVAL;
+	}
 
 	/*
 	 * Locking is a bit tricky here. If we take an exclusive lock for direct
@@ -336,7 +280,7 @@ xfs_file_read_iter(
 	 * serialisation.
 	 */
 	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-	if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
+	if (mapping->nrpages) {
 		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
 
@@ -351,8 +295,8 @@
 		 * flush and reduce the chances of repeated iolock cycles going
 		 * forward.
 		 */
-		if (inode->i_mapping->nrpages) {
-			ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+		if (mapping->nrpages) {
+			ret = filemap_write_and_wait(mapping);
 			if (ret) {
 				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
 				return ret;
@@ -363,20 +307,95 @@
 			 * we fail to invalidate a page, but this should never
 			 * happen on XFS. Warn if it does fail.
 			 */
-			ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
+			ret = invalidate_inode_pages2(mapping);
 			WARN_ON_ONCE(ret);
 			ret = 0;
 		}
 		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 	}
 
-	trace_xfs_file_read(ip, size, pos, ioflags);
+	data = *to;
+	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+			xfs_get_blocks_direct, NULL, NULL, 0);
+	if (ret > 0) {
+		iocb->ki_pos += ret;
+		iov_iter_advance(to, ret);
+	}
+	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 
+	file_accessed(iocb->ki_filp);
+	return ret;
+}
+
+static noinline ssize_t
+xfs_file_dax_read(
+	struct kiocb		*iocb,
+	struct iov_iter		*to)
+{
+	struct address_space	*mapping = iocb->ki_filp->f_mapping;
+	struct inode		*inode = mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct iov_iter		data = *to;
+	size_t			count = iov_iter_count(to);
+	ssize_t			ret = 0;
+
+	trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
+
+	if (!count)
+		return 0; /* skip atime */
+
+	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
+	if (ret > 0) {
+		iocb->ki_pos += ret;
+		iov_iter_advance(to, ret);
+	}
+	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+	file_accessed(iocb->ki_filp);
+	return ret;
+}
+
+STATIC ssize_t
+xfs_file_buffered_aio_read(
+	struct kiocb		*iocb,
+	struct iov_iter		*to)
+{
+	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
+	ssize_t			ret;
+
+	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
+
+	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 	ret = generic_file_read_iter(iocb, to);
+	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+	return ret;
+}
+
+STATIC ssize_t
+xfs_file_read_iter(
+	struct kiocb		*iocb,
+	struct iov_iter		*to)
+{
+	struct inode		*inode = file_inode(iocb->ki_filp);
+	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
+	ssize_t			ret = 0;
+
+	XFS_STATS_INC(mp, xs_read_calls);
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	if (IS_DAX(inode))
+		ret = xfs_file_dax_read(iocb, to);
+	else if (iocb->ki_flags & IOCB_DIRECT)
+		ret = xfs_file_dio_aio_read(iocb, to);
+	else
+		ret = xfs_file_buffered_aio_read(iocb, to);
+
 	if (ret > 0)
 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
-
-	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 	return ret;
 }
 
@@ -389,18 +408,14 @@ xfs_file_splice_read(
 	unsigned int		flags)
 {
 	struct xfs_inode	*ip = XFS_I(infilp->f_mapping->host);
-	int			ioflags = 0;
 	ssize_t			ret;
 
 	XFS_STATS_INC(ip->i_mount, xs_read_calls);
 
-	if (infilp->f_mode & FMODE_NOCMTIME)
-		ioflags |= XFS_IO_INVIS;
-
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
 
-	trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
+	trace_xfs_file_splice_read(ip, count, *ppos);
 
 	/*
 	 * DAX inodes cannot ues the page cache for splice, so we have to push
@@ -424,49 +439,6 @@ out:
 }
 
 /*
- * This routine is called to handle zeroing any space in the last block of the
- * file that is beyond the EOF. We do this since the size is being increased
- * without writing anything to that block and we don't want to read the
- * garbage on the disk.
- */
-STATIC int				/* error (positive) */
-xfs_zero_last_block(
-	struct xfs_inode	*ip,
-	xfs_fsize_t		offset,
-	xfs_fsize_t		isize,
-	bool			*did_zeroing)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	xfs_fileoff_t		last_fsb = XFS_B_TO_FSBT(mp, isize);
-	int			zero_offset = XFS_B_FSB_OFFSET(mp, isize);
-	int			zero_len;
-	int			nimaps = 1;
-	int			error = 0;
-	struct xfs_bmbt_irec	imap;
-
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	if (error)
-		return error;
-
-	ASSERT(nimaps > 0);
-
-	/*
-	 * If the block underlying isize is just a hole, then there
-	 * is nothing to zero.
-	 */
-	if (imap.br_startblock == HOLESTARTBLOCK)
-		return 0;
-
-	zero_len = mp->m_sb.sb_blocksize - zero_offset;
-	if (isize + zero_len > offset)
-		zero_len = offset - isize;
-	*did_zeroing = true;
-	return xfs_iozero(ip, isize, zero_len);
-}
-
-/*
  * Zero any on disk space between the current EOF and the new, larger EOF.
  *
  * This handles the normal case of zeroing the remainder of the last block in
@@ -484,94 +456,11 @@ xfs_zero_eof(
 	xfs_fsize_t		isize,		/* current inode size */
 	bool			*did_zeroing)
 {
-	struct xfs_mount	*mp = ip->i_mount;
-	xfs_fileoff_t		start_zero_fsb;
-	xfs_fileoff_t		end_zero_fsb;
-	xfs_fileoff_t		zero_count_fsb;
-	xfs_fileoff_t		last_fsb;
-	xfs_fileoff_t		zero_off;
-	xfs_fsize_t		zero_len;
-	int			nimaps;
-	int			error = 0;
-	struct xfs_bmbt_irec	imap;
-
 	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
 	ASSERT(offset > isize);
 
 	trace_xfs_zero_eof(ip, isize, offset - isize);
-
-	/*
-	 * First handle zeroing the block on which isize resides.
-	 *
-	 * We only zero a part of that block so it is handled specially.
-	 */
-	if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
-		error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
-		if (error)
-			return error;
-	}
-
-	/*
-	 * Calculate the range between the new size and the old where blocks
-	 * needing to be zeroed may exist.
-	 *
-	 * To get the block where the last byte in the file currently resides,
-	 * we need to subtract one from the size and truncate back to a block
-	 * boundary.  We subtract 1 in case the size is exactly on a block
-	 * boundary.
-	 */
-	last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
-	start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
-	end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
-	ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
-	if (last_fsb == end_zero_fsb) {
-		/*
-		 * The size was only incremented on its last block.
-		 * We took care of that above, so just return.
-		 */
-		return 0;
-	}
-
-	ASSERT(start_zero_fsb <= end_zero_fsb);
-	while (start_zero_fsb <= end_zero_fsb) {
-		nimaps = 1;
-		zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
-
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
-					  &imap, &nimaps, 0);
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		if (error)
-			return error;
-
-		ASSERT(nimaps > 0);
-
-		if (imap.br_state == XFS_EXT_UNWRITTEN ||
-		    imap.br_startblock == HOLESTARTBLOCK) {
-			start_zero_fsb = imap.br_startoff + imap.br_blockcount;
-			ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-			continue;
-		}
-
-		/*
-		 * There are blocks we need to zero.
-		 */
-		zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
-		zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
-
-		if ((zero_off + zero_len) > offset)
-			zero_len = offset - zero_off;
-
-		error = xfs_iozero(ip, zero_off, zero_len);
-		if (error)
-			return error;
-
-		*did_zeroing = true;
-		start_zero_fsb = imap.br_startoff + imap.br_blockcount;
-		ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-	}
-
-	return 0;
+	return xfs_zero_range(ip, isize, offset - isize, did_zeroing);
 }
 
 /*
@@ -722,8 +611,7 @@ xfs_file_dio_aio_write(
 			mp->m_rtdev_targp : mp->m_ddev_targp;
 
 	/* DIO must be aligned to device logical sector size */
-	if (!IS_DAX(inode) &&
-	    ((iocb->ki_pos | count) & target->bt_logical_sectormask))
+	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
 		return -EINVAL;
 
 	/* "unaligned" here means not aligned to a filesystem block */
@@ -762,7 +650,7 @@ xfs_file_dio_aio_write(
 	end = iocb->ki_pos + count - 1;
 
 	/*
-	 * See xfs_file_read_iter() for why we do a full-file flush here.
+	 * See xfs_file_dio_aio_read() for why we do a full-file flush here.
 	 */
 	if (mapping->nrpages) {
 		ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
@@ -789,10 +677,12 @@ xfs_file_dio_aio_write(
 		iolock = XFS_IOLOCK_SHARED;
 	}
 
-	trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
+	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
 
 	data = *from;
-	ret = mapping->a_ops->direct_IO(iocb, &data);
+	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+			xfs_get_blocks_direct, xfs_end_io_direct_write,
+			NULL, DIO_ASYNC_EXTEND);
 
 	/* see generic_file_direct_write() for why this is necessary */
 	if (mapping->nrpages) {
@@ -809,10 +699,70 @@ out:
 	xfs_rw_iunlock(ip, iolock);
 
 	/*
-	 * No fallback to buffered IO on errors for XFS. DAX can result in
-	 * partial writes, but direct IO will either complete fully or fail.
+	 * No fallback to buffered IO on errors for XFS, direct IO will either
+	 * complete fully or fail.
 	 */
-	ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
+	ASSERT(ret < 0 || ret == count);
+	return ret;
+}
+
+static noinline ssize_t
+xfs_file_dax_write(
+	struct kiocb		*iocb,
+	struct iov_iter		*from)
+{
+	struct address_space	*mapping = iocb->ki_filp->f_mapping;
+	struct inode		*inode = mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	ssize_t			ret = 0;
+	int			unaligned_io = 0;
+	int			iolock;
+	struct iov_iter		data;
+
+	/* "unaligned" here means not aligned to a filesystem block */
+	if ((iocb->ki_pos & mp->m_blockmask) ||
+	    ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
+		unaligned_io = 1;
+		iolock = XFS_IOLOCK_EXCL;
+	} else if (mapping->nrpages) {
+		iolock = XFS_IOLOCK_EXCL;
+	} else {
+		iolock = XFS_IOLOCK_SHARED;
+	}
+	xfs_rw_ilock(ip, iolock);
+
+	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+	if (ret)
+		goto out;
+
+	/*
+	 * Yes, even DAX files can have page cache attached to them:  A zeroed
+	 * page is inserted into the pagecache when we have to serve a write
+	 * fault on a hole.  It should never be dirtied and can simply be
+	 * dropped from the pagecache once we get real data for the page.
+	 */
+	if (mapping->nrpages) {
+		ret = invalidate_inode_pages2(mapping);
+		WARN_ON_ONCE(ret);
+	}
+
+	if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
+		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+		iolock = XFS_IOLOCK_SHARED;
+	}
+
+	trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
+
+	data = *from;
+	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
+			xfs_end_io_direct_write, 0);
+	if (ret > 0) {
+		iocb->ki_pos += ret;
+		iov_iter_advance(from, ret);
+	}
+out:
+	xfs_rw_iunlock(ip, iolock);
 	return ret;
 }
 
@@ -839,9 +789,8 @@ xfs_file_buffered_aio_write(
 	current->backing_dev_info = inode_to_bdi(inode);
 
 write_retry:
-	trace_xfs_file_buffered_write(ip, iov_iter_count(from),
-				      iocb->ki_pos, 0);
-	ret = generic_perform_write(file, from, iocb->ki_pos);
+	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
+	ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
 	if (likely(ret >= 0))
 		iocb->ki_pos += ret;
 
@@ -895,7 +844,9 @@ xfs_file_write_iter(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
 
-	if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
+	if (IS_DAX(inode))
+		ret = xfs_file_dax_write(iocb, from);
+	else if (iocb->ki_flags & IOCB_DIRECT)
 		ret = xfs_file_dio_aio_write(iocb, from);
 	else
 		ret = xfs_file_buffered_aio_write(iocb, from);
@@ -1553,7 +1504,7 @@ xfs_filemap_page_mkwrite(
 	if (IS_DAX(inode)) {
 		ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
 	} else {
-		ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+		ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
 		ret = block_page_mkwrite_return(ret);
 	}