Diffstat (limited to 'fs/xfs/xfs_aops.c')
-rw-r--r-- | fs/xfs/xfs_aops.c | 1007
1 file changed, 271 insertions, 736 deletions
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 8eb3ba3d4d00..49f5f5896a43 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2016-2018 Christoph Hellwig. * All Rights Reserved. */ #include "xfs.h" @@ -20,9 +21,6 @@ #include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" #include "xfs_reflink.h" -#include <linux/gfp.h> -#include <linux/mpage.h> -#include <linux/pagevec.h> #include <linux/writeback.h> /* @@ -30,31 +28,11 @@ */ struct xfs_writepage_ctx { struct xfs_bmbt_irec imap; - bool imap_valid; unsigned int io_type; + unsigned int cow_seq; struct xfs_ioend *ioend; - sector_t last_block; }; -void -xfs_count_page_state( - struct page *page, - int *delalloc, - int *unwritten) -{ - struct buffer_head *bh, *head; - - *delalloc = *unwritten = 0; - - bh = head = page_buffers(page); - do { - if (buffer_unwritten(bh)) - (*unwritten) = 1; - else if (buffer_delay(bh)) - (*delalloc) = 1; - } while ((bh = bh->b_this_page) != head); -} - struct block_device * xfs_find_bdev_for_inode( struct inode *inode) @@ -81,60 +59,23 @@ xfs_find_daxdev_for_inode( return mp->m_ddev_targp->bt_daxdev; } -/* - * We're now finished for good with this page. Update the page state via the - * associated buffer_heads, paying attention to the start and end offsets that - * we need to process on the page. - * - * Note that we open code the action in end_buffer_async_write here so that we - * only have to iterate over the buffers attached to the page once. This is not - * only more efficient, but also ensures that we only calls end_page_writeback - * at the end of the iteration, and thus avoids the pitfall of having the page - * and buffers potentially freed after every call to end_buffer_async_write. 
- */ static void xfs_finish_page_writeback( struct inode *inode, struct bio_vec *bvec, int error) { - struct buffer_head *head = page_buffers(bvec->bv_page), *bh = head; - bool busy = false; - unsigned int off = 0; - unsigned long flags; - - ASSERT(bvec->bv_offset < PAGE_SIZE); - ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0); - ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE); - ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0); - - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &head->b_state); - do { - if (off >= bvec->bv_offset && - off < bvec->bv_offset + bvec->bv_len) { - ASSERT(buffer_async_write(bh)); - ASSERT(bh->b_end_io == NULL); - - if (error) { - mark_buffer_write_io_error(bh); - clear_buffer_uptodate(bh); - SetPageError(bvec->bv_page); - } else { - set_buffer_uptodate(bh); - } - clear_buffer_async_write(bh); - unlock_buffer(bh); - } else if (buffer_async_write(bh)) { - ASSERT(buffer_locked(bh)); - busy = true; - } - off += bh->b_size; - } while ((bh = bh->b_this_page) != head); - bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); - local_irq_restore(flags); + struct iomap_page *iop = to_iomap_page(bvec->bv_page); + + if (error) { + SetPageError(bvec->bv_page); + mapping_set_error(inode->i_mapping, -EIO); + } + + ASSERT(iop || i_blocksize(inode) == PAGE_SIZE); + ASSERT(!iop || atomic_read(&iop->write_count) > 0); - if (!busy) + if (!iop || atomic_dec_and_test(&iop->write_count)) end_page_writeback(bvec->bv_page); } @@ -170,7 +111,6 @@ xfs_destroy_ioend( /* walk each page on bio, ending page IO on them */ bio_for_each_segment_all(bvec, bio, i) xfs_finish_page_writeback(inode, bvec, error); - bio_put(bio); } @@ -363,39 +303,66 @@ xfs_end_bio( STATIC int xfs_map_blocks( + struct xfs_writepage_ctx *wpc, struct inode *inode, - loff_t offset, - struct xfs_bmbt_irec *imap, - int type) + loff_t offset) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; ssize_t count = i_blocksize(inode); - xfs_fileoff_t offset_fsb, end_fsb; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb; + xfs_fileoff_t cow_fsb = NULLFILEOFF; + struct xfs_bmbt_irec imap; + int whichfork = XFS_DATA_FORK; + struct xfs_iext_cursor icur; + bool imap_valid; int error = 0; - int bmapi_flags = XFS_BMAPI_ENTIRE; - int nimaps = 1; - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; + /* + * We have to make sure the cached mapping is within EOF to protect + * against eofblocks trimming on file release leaving us with a stale + * mapping. Otherwise, a page for a subsequent file extending buffered + * write could get picked up by this writeback cycle and written to the + * wrong blocks. + * + * Note that what we really want here is a generic mapping invalidation + * mechanism to protect us from arbitrary extent modifying contexts, not + * just eofblocks. + */ + xfs_trim_extent_eof(&wpc->imap, ip); /* - * Truncate can race with writeback since writeback doesn't take the - * iolock and truncate decreases the file size before it starts - * truncating the pages between new_size and old_size. Therefore, we - * can end up in the situation where writeback gets a CoW fork mapping - * but the truncate makes the mapping invalid and we end up in here - * trying to get a new mapping. Bail out here so that we simply never - * get a valid mapping and so we drop the write altogether. The page - * truncation will kill the contents anyway. + * COW fork blocks can overlap data fork blocks even if the blocks + * aren't shared. 
COW I/O always takes precedent, so we must always + * check for overlap on reflink inodes unless the mapping is already a + * COW one, or the COW fork hasn't changed from the last time we looked + * at it. + * + * It's safe to check the COW fork if_seq here without the ILOCK because + * we've indirectly protected against concurrent updates: writeback has + * the page locked, which prevents concurrent invalidations by reflink + * and directio and prevents concurrent buffered writes to the same + * page. Changes to if_seq always happen under i_lock, which protects + * against concurrent updates and provides a memory barrier on the way + * out that ensures that we always see the current value. */ - if (type == XFS_IO_COW && offset > i_size_read(inode)) + imap_valid = offset_fsb >= wpc->imap.br_startoff && + offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount; + if (imap_valid && + (!xfs_inode_has_cow_data(ip) || + wpc->io_type == XFS_IO_COW || + wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq))) return 0; - ASSERT(type != XFS_IO_COW); - if (type == XFS_IO_UNWRITTEN) - bmapi_flags |= XFS_BMAPI_IGSTATE; + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + /* + * If we don't have a valid map, now it's time to get a new one for this + * offset. This will convert delayed allocations (including COW ones) + * into real extents. If we return without a valid map, it means we + * landed in a hole and we skip the block. + */ xfs_ilock(ip, XFS_ILOCK_SHARED); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || (ip->i_df.if_flags & XFS_IFEXTENTS)); @@ -404,109 +371,96 @@ xfs_map_blocks( if (offset > mp->m_super->s_maxbytes - count) count = mp->m_super->s_maxbytes - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); - offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, - imap, &nimaps, bmapi_flags); + /* - * Truncate an overwrite extent if there's a pending CoW - * reservation before the end of this extent. This forces us - * to come back to writepage to take care of the CoW. + * Check if this is offset is covered by a COW extents, and if yes use + * it directly instead of looking up anything in the data fork. */ - if (nimaps && type == XFS_IO_OVERWRITE) - xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, imap); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (error) - return error; - - if (type == XFS_IO_DELALLOC && - (!nimaps || isnullstartblock(imap->br_startblock))) { - error = xfs_iomap_write_allocate(ip, XFS_DATA_FORK, offset, - imap); - if (!error) - trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); - return error; - } - -#ifdef DEBUG - if (type == XFS_IO_UNWRITTEN) { - ASSERT(nimaps); - ASSERT(imap->br_startblock != HOLESTARTBLOCK); - ASSERT(imap->br_startblock != DELAYSTARTBLOCK); + if (xfs_inode_has_cow_data(ip) && + xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap)) + cow_fsb = imap.br_startoff; + if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { + wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + /* + * Truncate can race with writeback since writeback doesn't + * take the iolock and truncate decreases the file size before + * it starts truncating the pages between new_size and old_size. + * Therefore, we can end up in the situation where writeback + * gets a CoW fork mapping but the truncate makes the mapping + * invalid and we end up in here trying to get a new mapping. 
+ * bail out here so that we simply never get a valid mapping + * and so we drop the write altogether. The page truncation + * will kill the contents anyway. + */ + if (offset > i_size_read(inode)) { + wpc->io_type = XFS_IO_HOLE; + return 0; + } + whichfork = XFS_COW_FORK; + wpc->io_type = XFS_IO_COW; + goto allocate_blocks; } -#endif - if (nimaps) - trace_xfs_map_blocks_found(ip, offset, count, type, imap); - return 0; -} - -STATIC bool -xfs_imap_valid( - struct inode *inode, - struct xfs_bmbt_irec *imap, - xfs_off_t offset) -{ - offset >>= inode->i_blkbits; /* - * We have to make sure the cached mapping is within EOF to protect - * against eofblocks trimming on file release leaving us with a stale - * mapping. Otherwise, a page for a subsequent file extending buffered - * write could get picked up by this writeback cycle and written to the - * wrong blocks. - * - * Note that what we really want here is a generic mapping invalidation - * mechanism to protect us from arbitrary extent modifying contexts, not - * just eofblocks. + * Map valid and no COW extent in the way? We're done. */ - xfs_trim_extent_eof(imap, XFS_I(inode)); - - return offset >= imap->br_startoff && - offset < imap->br_startoff + imap->br_blockcount; -} - -STATIC void -xfs_start_buffer_writeback( - struct buffer_head *bh) -{ - ASSERT(buffer_mapped(bh)); - ASSERT(buffer_locked(bh)); - ASSERT(!buffer_delay(bh)); - ASSERT(!buffer_unwritten(bh)); - - bh->b_end_io = NULL; - set_buffer_async_write(bh); - set_buffer_uptodate(bh); - clear_buffer_dirty(bh); -} - -STATIC void -xfs_start_page_writeback( - struct page *page, - int clear_dirty) -{ - ASSERT(PageLocked(page)); - ASSERT(!PageWriteback(page)); + if (imap_valid) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return 0; + } /* - * if the page was not fully cleaned, we need to ensure that the higher - * layers come back to it correctly. That means we need to keep the page - * dirty, and for WB_SYNC_ALL writeback we need to ensure the - * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to - * write this page in this writeback sweep will be made. + * If we don't have a valid map, now it's time to get a new one for this + * offset. This will convert delayed allocations (including COW ones) + * into real extents. */ - if (clear_dirty) { - clear_page_dirty_for_io(page); - set_page_writeback(page); - } else - set_page_writeback_keepwrite(page); + if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) + imap.br_startoff = end_fsb; /* fake a hole past EOF */ + xfs_iunlock(ip, XFS_ILOCK_SHARED); - unlock_page(page); -} + if (imap.br_startoff > offset_fsb) { + /* landed in a hole or beyond EOF */ + imap.br_blockcount = imap.br_startoff - offset_fsb; + imap.br_startoff = offset_fsb; + imap.br_startblock = HOLESTARTBLOCK; + wpc->io_type = XFS_IO_HOLE; + } else { + /* + * Truncate to the next COW extent if there is one. This is the + * only opportunity to do this because we can skip COW fork + * lookups for the subsequent blocks in the mapping; however, + * the requirement to treat the COW range separately remains. 
+ */ + if (cow_fsb != NULLFILEOFF && + cow_fsb < imap.br_startoff + imap.br_blockcount) + imap.br_blockcount = cow_fsb - imap.br_startoff; + + if (isnullstartblock(imap.br_startblock)) { + /* got a delalloc extent */ + wpc->io_type = XFS_IO_DELALLOC; + goto allocate_blocks; + } -static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh) -{ - return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); + if (imap.br_state == XFS_EXT_UNWRITTEN) + wpc->io_type = XFS_IO_UNWRITTEN; + else + wpc->io_type = XFS_IO_OVERWRITE; + } + + wpc->imap = imap; + trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap); + return 0; +allocate_blocks: + error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap, + &wpc->cow_seq); + if (error) + return error; + ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF || + imap.br_startoff + imap.br_blockcount <= cow_fsb); + wpc->imap = imap; + trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap); + return 0; } /* @@ -574,27 +528,20 @@ xfs_submit_ioend( return 0; } -static void -xfs_init_bio_from_bh( - struct bio *bio, - struct buffer_head *bh) -{ - bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio_set_dev(bio, bh->b_bdev); -} - static struct xfs_ioend * xfs_alloc_ioend( struct inode *inode, unsigned int type, xfs_off_t offset, - struct buffer_head *bh) + struct block_device *bdev, + sector_t sector) { struct xfs_ioend *ioend; struct bio *bio; bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset); - xfs_init_bio_from_bh(bio, bh); + bio_set_dev(bio, bdev); + bio->bi_iter.bi_sector = sector; ioend = container_of(bio, struct xfs_ioend, io_inline_bio); INIT_LIST_HEAD(&ioend->io_list); @@ -619,13 +566,14 @@ static void xfs_chain_bio( struct xfs_ioend *ioend, struct writeback_control *wbc, - struct buffer_head *bh) + struct block_device *bdev, + sector_t sector) { struct bio *new; new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES); - xfs_init_bio_from_bh(new, bh); - + bio_set_dev(new, bdev); + new->bi_iter.bi_sector = sector; bio_chain(ioend->io_bio, new); bio_get(ioend->io_bio); /* for xfs_destroy_ioend */ ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); @@ -635,122 +583,47 @@ xfs_chain_bio( } /* - * Test to see if we've been building up a completion structure for - * earlier buffers -- if so, we try to append to this ioend if we - * can, otherwise we finish off any current ioend and start another. - * Return the ioend we finished off so that the caller can submit it - * once it has finished processing the dirty page. + * Test to see if we have an existing ioend structure that we could append to + * first, otherwise finish off the current ioend and start another. 
*/ STATIC void xfs_add_to_ioend( struct inode *inode, - struct buffer_head *bh, xfs_off_t offset, + struct page *page, + struct iomap_page *iop, struct xfs_writepage_ctx *wpc, struct writeback_control *wbc, struct list_head *iolist) { + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct block_device *bdev = xfs_find_bdev_for_inode(inode); + unsigned len = i_blocksize(inode); + unsigned poff = offset & (PAGE_SIZE - 1); + sector_t sector; + + sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) + + ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9); + if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type || - bh->b_blocknr != wpc->last_block + 1 || + sector != bio_end_sector(wpc->ioend->io_bio) || offset != wpc->ioend->io_offset + wpc->ioend->io_size) { if (wpc->ioend) list_add(&wpc->ioend->io_list, iolist); - wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bh); + wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, + bdev, sector); } - /* - * If the buffer doesn't fit into the bio we need to allocate a new - * one. This shouldn't happen more than once for a given buffer. - */ - while (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size) - xfs_chain_bio(wpc->ioend, wbc, bh); - - wpc->ioend->io_size += bh->b_size; - wpc->last_block = bh->b_blocknr; - xfs_start_buffer_writeback(bh); -} - -STATIC void -xfs_map_buffer( - struct inode *inode, - struct buffer_head *bh, - struct xfs_bmbt_irec *imap, - xfs_off_t offset) -{ - sector_t bn; - struct xfs_mount *m = XFS_I(inode)->i_mount; - xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff); - xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock); - - ASSERT(imap->br_startblock != HOLESTARTBLOCK); - ASSERT(imap->br_startblock != DELAYSTARTBLOCK); - - bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) + - ((offset - iomap_offset) >> inode->i_blkbits); - - ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode))); - - bh->b_blocknr = bn; - set_buffer_mapped(bh); -} - -STATIC void -xfs_map_at_offset( - struct inode *inode, - struct buffer_head *bh, - struct xfs_bmbt_irec *imap, - xfs_off_t offset) -{ - ASSERT(imap->br_startblock != HOLESTARTBLOCK); - ASSERT(imap->br_startblock != DELAYSTARTBLOCK); - - xfs_map_buffer(inode, bh, imap, offset); - set_buffer_mapped(bh); - clear_buffer_delay(bh); - clear_buffer_unwritten(bh); -} - -/* - * Test if a given page contains at least one buffer of a given @type. - * If @check_all_buffers is true, then we walk all the buffers in the page to - * try to find one of the type passed in. If it is not set, then the caller only - * needs to check the first buffer on the page for a match. - */ -STATIC bool -xfs_check_page_type( - struct page *page, - unsigned int type, - bool check_all_buffers) -{ - struct buffer_head *bh; - struct buffer_head *head; - - if (PageWriteback(page)) - return false; - if (!page->mapping) - return false; - if (!page_has_buffers(page)) - return false; - - bh = head = page_buffers(page); - do { - if (buffer_unwritten(bh)) { - if (type == XFS_IO_UNWRITTEN) - return true; - } else if (buffer_delay(bh)) { - if (type == XFS_IO_DELALLOC) - return true; - } else if (buffer_dirty(bh) && buffer_mapped(bh)) { - if (type == XFS_IO_OVERWRITE) - return true; - } - - /* If we are only checking the first buffer, we are done now. 
*/ - if (!check_all_buffers) - break; - } while ((bh = bh->b_this_page) != head); + if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) { + if (iop) + atomic_inc(&iop->write_count); + if (bio_full(wpc->ioend->io_bio)) + xfs_chain_bio(wpc->ioend, wbc, bdev, sector); + __bio_add_page(wpc->ioend->io_bio, page, len, poff); + } - return false; + wpc->ioend->io_size += len; } STATIC void @@ -759,34 +632,20 @@ xfs_vm_invalidatepage( unsigned int offset, unsigned int length) { - trace_xfs_invalidatepage(page->mapping->host, page, offset, - length); - - /* - * If we are invalidating the entire page, clear the dirty state from it - * so that we can check for attempts to release dirty cached pages in - * xfs_vm_releasepage(). - */ - if (offset == 0 && length >= PAGE_SIZE) - cancel_dirty_page(page); - block_invalidatepage(page, offset, length); + trace_xfs_invalidatepage(page->mapping->host, page, offset, length); + iomap_invalidatepage(page, offset, length); } /* - * If the page has delalloc buffers on it, we need to punch them out before we - * invalidate the page. If we don't, we leave a stale delalloc mapping on the - * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read - * is done on that same region - the delalloc extent is returned when none is - * supposed to be there. - * - * We prevent this by truncating away the delalloc regions on the page before - * invalidating it. Because they are delalloc, we can do this without needing a - * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this - * truncation without a transaction as there is no space left for block - * reservation (typically why we see a ENOSPC in writeback). + * If the page has delalloc blocks on it, we need to punch them out before we + * invalidate the page. If we don't, we leave a stale delalloc mapping on the + * inode that can trip up a later direct I/O read operation on the same region. * - * This is not a performance critical path, so for now just do the punching a - * buffer head at a time. + * We prevent this by truncating away the delalloc regions on the page. Because + * they are delalloc, we can do this without needing a transaction. Indeed - if + * we get ENOSPC errors, we have to be able to do this truncation without a + * transaction as there is no space left for block reservation (typically why we + * see a ENOSPC in writeback). 
*/ STATIC void xfs_aops_discard_page( @@ -794,104 +653,31 @@ xfs_aops_discard_page( { struct inode *inode = page->mapping->host; struct xfs_inode *ip = XFS_I(inode); - struct buffer_head *bh, *head; + struct xfs_mount *mp = ip->i_mount; loff_t offset = page_offset(page); + xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, offset); + int error; - if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true)) - goto out_invalidate; - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (XFS_FORCED_SHUTDOWN(mp)) goto out_invalidate; - xfs_alert(ip->i_mount, + xfs_alert(mp, "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.", page, ip->i_ino, offset); - xfs_ilock(ip, XFS_ILOCK_EXCL); - bh = head = page_buffers(page); - do { - int error; - xfs_fileoff_t start_fsb; - - if (!buffer_delay(bh)) - goto next_buffer; - - start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1); - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_alert(ip->i_mount, - "page discard unable to remove delalloc mapping."); - } - break; - } -next_buffer: - offset += i_blocksize(inode); - - } while ((bh = bh->b_this_page) != head); - - xfs_iunlock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + PAGE_SIZE / i_blocksize(inode)); + if (error && !XFS_FORCED_SHUTDOWN(mp)) + xfs_alert(mp, "page discard unable to remove delalloc mapping."); out_invalidate: xfs_vm_invalidatepage(page, 0, PAGE_SIZE); - return; -} - -static int -xfs_map_cow( - struct xfs_writepage_ctx *wpc, - struct inode *inode, - loff_t offset, - unsigned int *new_type) -{ - struct xfs_inode *ip = XFS_I(inode); - struct xfs_bmbt_irec imap; - bool is_cow = false; - int error; - - /* - * If we already have a valid COW mapping keep using it. - */ - if (wpc->io_type == XFS_IO_COW) { - wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset); - if (wpc->imap_valid) { - *new_type = XFS_IO_COW; - return 0; - } - } - - /* - * Else we need to check if there is a COW mapping at this offset. - */ - xfs_ilock(ip, XFS_ILOCK_SHARED); - is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (!is_cow) - return 0; - - /* - * And if the COW mapping has a delayed extent here we need to - * allocate real space for it now. - */ - if (isnullstartblock(imap.br_startblock)) { - error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset, - &imap); - if (error) - return error; - } - - wpc->io_type = *new_type = XFS_IO_COW; - wpc->imap_valid = true; - wpc->imap = imap; - return 0; } /* * We implement an immediate ioend submission policy here to avoid needing to * chain multiple ioends and hence nest mempool allocations which can violate * forward progress guarantees we need to provide. The current ioend we are - * adding buffers to is cached on the writepage context, and if the new buffer + * adding blocks to is cached on the writepage context, and if the new block * does not append to the cached ioend it will create a new ioend and cache that * instead. 
* @@ -912,138 +698,99 @@ xfs_writepage_map( uint64_t end_offset) { LIST_HEAD(submit_list); + struct iomap_page *iop = to_iomap_page(page); + unsigned len = i_blocksize(inode); struct xfs_ioend *ioend, *next; - struct buffer_head *bh, *head; - ssize_t len = i_blocksize(inode); - uint64_t offset; - int error = 0; - int count = 0; - int uptodate = 1; - unsigned int new_type; - - bh = head = page_buffers(page); - offset = page_offset(page); - do { - if (offset >= end_offset) - break; - if (!buffer_uptodate(bh)) - uptodate = 0; + uint64_t file_offset; /* file offset of page */ + int error = 0, count = 0, i; - /* - * set_page_dirty dirties all buffers in a page, independent - * of their state. The dirty state however is entirely - * meaningless for holes (!mapped && uptodate), so skip - * buffers covering holes here. - */ - if (!buffer_mapped(bh) && buffer_uptodate(bh)) { - wpc->imap_valid = false; - continue; - } + ASSERT(iop || i_blocksize(inode) == PAGE_SIZE); + ASSERT(!iop || atomic_read(&iop->write_count) == 0); - if (buffer_unwritten(bh)) - new_type = XFS_IO_UNWRITTEN; - else if (buffer_delay(bh)) - new_type = XFS_IO_DELALLOC; - else if (buffer_uptodate(bh)) - new_type = XFS_IO_OVERWRITE; - else { - if (PageUptodate(page)) - ASSERT(buffer_mapped(bh)); - /* - * This buffer is not uptodate and will not be - * written to disk. Ensure that we will put any - * subsequent writeable buffers into a new - * ioend. - */ - wpc->imap_valid = false; + /* + * Walk through the page to find areas to write back. If we run off the + * end of the current map or find the current map invalid, grab a new + * one. + */ + for (i = 0, file_offset = page_offset(page); + i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset; + i++, file_offset += len) { + if (iop && !test_bit(i, iop->uptodate)) continue; - } - if (xfs_is_reflink_inode(XFS_I(inode))) { - error = xfs_map_cow(wpc, inode, offset, &new_type); - if (error) - goto out; - } - - if (wpc->io_type != new_type) { - wpc->io_type = new_type; - wpc->imap_valid = false; - } - - if (wpc->imap_valid) - wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, - offset); - if (!wpc->imap_valid) { - error = xfs_map_blocks(inode, offset, &wpc->imap, - wpc->io_type); - if (error) - goto out; - wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, - offset); - } - if (wpc->imap_valid) { - lock_buffer(bh); - if (wpc->io_type != XFS_IO_OVERWRITE) - xfs_map_at_offset(inode, bh, &wpc->imap, offset); - xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list); - count++; - } - - } while (offset += len, ((bh = bh->b_this_page) != head)); - - if (uptodate && bh == head) - SetPageUptodate(page); + error = xfs_map_blocks(wpc, inode, file_offset); + if (error) + break; + if (wpc->io_type == XFS_IO_HOLE) + continue; + xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc, + &submit_list); + count++; + } ASSERT(wpc->ioend || list_empty(&submit_list)); + ASSERT(PageLocked(page)); + ASSERT(!PageWriteback(page)); -out: /* - * On error, we have to fail the ioend here because we have locked - * buffers in the ioend. If we don't do this, we'll deadlock - * invalidating the page as that tries to lock the buffers on the page. - * Also, because we may have set pages under writeback, we have to make - * sure we run IO completion to mark the error state of the IO - * appropriately, so we can't cancel the ioend directly here. That means - * we have to mark this page as under writeback if we included any - * buffers from it in the ioend chain so that completion treats it - * correctly. 
+ * On error, we have to fail the ioend here because we may have set + * pages under writeback, we have to make sure we run IO completion to + * mark the error state of the IO appropriately, so we can't cancel the + * ioend directly here. That means we have to mark this page as under + * writeback if we included any blocks from it in the ioend chain so + * that completion treats it correctly. * * If we didn't include the page in the ioend, the on error we can * simply discard and unlock it as there are no other users of the page - * or it's buffers right now. The caller will still need to trigger - * submission of outstanding ioends on the writepage context so they are - * treated correctly on error. + * now. The caller will still need to trigger submission of outstanding + * ioends on the writepage context so they are treated correctly on + * error. */ - if (count) { - xfs_start_page_writeback(page, !error); + if (unlikely(error)) { + if (!count) { + xfs_aops_discard_page(page); + ClearPageUptodate(page); + unlock_page(page); + goto done; + } /* - * Preserve the original error if there was one, otherwise catch - * submission errors here and propagate into subsequent ioend - * submissions. + * If the page was not fully cleaned, we need to ensure that the + * higher layers come back to it correctly. That means we need + * to keep the page dirty, and for WB_SYNC_ALL writeback we need + * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed + * so another attempt to write this page in this writeback sweep + * will be made. */ - list_for_each_entry_safe(ioend, next, &submit_list, io_list) { - int error2; - - list_del_init(&ioend->io_list); - error2 = xfs_submit_ioend(wbc, ioend, error); - if (error2 && !error) - error = error2; - } - } else if (error) { - xfs_aops_discard_page(page); - ClearPageUptodate(page); - unlock_page(page); + set_page_writeback_keepwrite(page); } else { - /* - * We can end up here with no error and nothing to write if we - * race with a partial page truncate on a sub-page block sized - * filesystem. In that case we need to mark the page clean. - */ - xfs_start_page_writeback(page, 1); - end_page_writeback(page); + clear_page_dirty_for_io(page); + set_page_writeback(page); } + unlock_page(page); + + /* + * Preserve the original error if there was one, otherwise catch + * submission errors here and propagate into subsequent ioend + * submissions. + */ + list_for_each_entry_safe(ioend, next, &submit_list, io_list) { + int error2; + + list_del_init(&ioend->io_list); + error2 = xfs_submit_ioend(wbc, ioend, error); + if (error2 && !error) + error = error2; + } + + /* + * We can end up here with no error and nothing to write only if we race + * with a partial page truncate on a sub-page block sized filesystem. + */ + if (!count) + end_page_writeback(page); +done: mapping_set_error(page->mapping, error); return error; } @@ -1054,7 +801,6 @@ out: * For delalloc space on the page we need to allocate space and flush it. * For unwritten space on the page we need to start the conversion to * regular allocated space. - * For any other dirty buffer heads on the page we should flush them. */ STATIC int xfs_do_writepage( @@ -1070,8 +816,6 @@ xfs_do_writepage( trace_xfs_writepage(inode, page, 0, 0); - ASSERT(page_has_buffers(page)); - /* * Refuse to write the page out if we are called from reclaim context. 
* @@ -1210,166 +954,13 @@ xfs_dax_writepages( xfs_find_bdev_for_inode(mapping->host), wbc); } -/* - * Called to move a page into cleanable state - and from there - * to be released. The page should already be clean. We always - * have buffer heads in this call. - * - * Returns 1 if the page is ok to release, 0 otherwise. - */ STATIC int xfs_vm_releasepage( struct page *page, gfp_t gfp_mask) { - int delalloc, unwritten; - trace_xfs_releasepage(page->mapping->host, page, 0, 0); - - /* - * mm accommodates an old ext3 case where clean pages might not have had - * the dirty bit cleared. Thus, it can send actual dirty pages to - * ->releasepage() via shrink_active_list(). Conversely, - * block_invalidatepage() can send pages that are still marked dirty but - * otherwise have invalidated buffers. - * - * We want to release the latter to avoid unnecessary buildup of the - * LRU, so xfs_vm_invalidatepage() clears the page dirty flag on pages - * that are entirely invalidated and need to be released. Hence the - * only time we should get dirty pages here is through - * shrink_active_list() and so we can simply skip those now. - * - * warn if we've left any lingering delalloc/unwritten buffers on clean - * or invalidated pages we are about to release. - */ - if (PageDirty(page)) - return 0; - - xfs_count_page_state(page, &delalloc, &unwritten); - - if (WARN_ON_ONCE(delalloc)) - return 0; - if (WARN_ON_ONCE(unwritten)) - return 0; - - return try_to_free_buffers(page); -} - -/* - * If this is O_DIRECT or the mpage code calling tell them how large the mapping - * is, so that we can avoid repeated get_blocks calls. - * - * If the mapping spans EOF, then we have to break the mapping up as the mapping - * for blocks beyond EOF must be marked new so that sub block regions can be - * correctly zeroed. We can't do this for mappings within EOF unless the mapping - * was just allocated or is unwritten, otherwise the callers would overwrite - * existing data with zeros. Hence we have to split the mapping into a range up - * to and including EOF, and a second mapping for beyond EOF. - */ -static void -xfs_map_trim_size( - struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - struct xfs_bmbt_irec *imap, - xfs_off_t offset, - ssize_t size) -{ - xfs_off_t mapping_size; - - mapping_size = imap->br_startoff + imap->br_blockcount - iblock; - mapping_size <<= inode->i_blkbits; - - ASSERT(mapping_size > 0); - if (mapping_size > size) - mapping_size = size; - if (offset < i_size_read(inode) && - (xfs_ufsize_t)offset + mapping_size >= i_size_read(inode)) { - /* limit mapping to block that spans EOF */ - mapping_size = roundup_64(i_size_read(inode) - offset, - i_blocksize(inode)); - } - if (mapping_size > LONG_MAX) - mapping_size = LONG_MAX; - - bh_result->b_size = mapping_size; -} - -static int -xfs_get_blocks( - struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - int create) -{ - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t offset_fsb, end_fsb; - int error = 0; - int lockmode = 0; - struct xfs_bmbt_irec imap; - int nimaps = 1; - xfs_off_t offset; - ssize_t size; - - BUG_ON(create); - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - offset = (xfs_off_t)iblock << inode->i_blkbits; - ASSERT(bh_result->b_size >= i_blocksize(inode)); - size = bh_result->b_size; - - if (offset >= i_size_read(inode)) - return 0; - - /* - * Direct I/O is usually done on preallocated files, so try getting - * a block mapping without an exclusive lock first. 
- */ - lockmode = xfs_ilock_data_map_shared(ip); - - ASSERT(offset <= mp->m_super->s_maxbytes); - if (offset > mp->m_super->s_maxbytes - size) - size = mp->m_super->s_maxbytes - offset; - end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); - offset_fsb = XFS_B_TO_FSBT(mp, offset); - - error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, - &nimaps, 0); - if (error) - goto out_unlock; - if (!nimaps) { - trace_xfs_get_blocks_notfound(ip, offset, size); - goto out_unlock; - } - - trace_xfs_get_blocks_found(ip, offset, size, - imap.br_state == XFS_EXT_UNWRITTEN ? - XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap); - xfs_iunlock(ip, lockmode); - - /* trim mapping down to size requested */ - xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size); - - /* - * For unwritten extents do not report a disk address in the buffered - * read case (treat as if we're reading into a hole). - */ - if (xfs_bmap_is_real_extent(&imap)) - xfs_map_buffer(inode, bh_result, &imap, offset); - - /* - * If this is a realtime file, data may be on a different device. - * to that pointed to from the buffer_head b_bdev currently. - */ - bh_result->b_bdev = xfs_find_bdev_for_inode(inode); - return 0; - -out_unlock: - xfs_iunlock(ip, lockmode); - return error; + return iomap_releasepage(page, gfp_mask); } STATIC sector_t @@ -1401,7 +992,7 @@ xfs_vm_readpage( struct page *page) { trace_xfs_vm_readpage(page->mapping->host, 1); - return mpage_readpage(page, xfs_get_blocks); + return iomap_readpage(page, &xfs_iomap_ops); } STATIC int @@ -1412,63 +1003,7 @@ xfs_vm_readpages( unsigned nr_pages) { trace_xfs_vm_readpages(mapping->host, nr_pages); - return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); -} - -/* - * This is basically a copy of __set_page_dirty_buffers() with one - * small tweak: buffers beyond EOF do not get marked dirty. If we mark them - * dirty, we'll never be able to clean them because we don't write buffers - * beyond EOF, and that means we can't invalidate pages that span EOF - * that have been marked dirty. Further, the dirty state can leak into - * the file interior if the file is extended, resulting in all sorts of - * bad things happening as the state does not match the underlying data. - * - * XXX: this really indicates that bufferheads in XFS need to die. Warts like - * this only exist because of bufferheads and how the generic code manages them. - */ -STATIC int -xfs_vm_set_page_dirty( - struct page *page) -{ - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - loff_t end_offset; - loff_t offset; - int newly_dirty; - - if (unlikely(!mapping)) - return !TestSetPageDirty(page); - - end_offset = i_size_read(inode); - offset = page_offset(page); - - spin_lock(&mapping->private_lock); - if (page_has_buffers(page)) { - struct buffer_head *head = page_buffers(page); - struct buffer_head *bh = head; - - do { - if (offset < end_offset) - set_buffer_dirty(bh); - bh = bh->b_this_page; - offset += i_blocksize(inode); - } while (bh != head); - } - /* - * Lock out page->mem_cgroup migration to keep PageDirty - * synchronized with per-memcg dirty page counters. 
- */ - lock_page_memcg(page); - newly_dirty = !TestSetPageDirty(page); - spin_unlock(&mapping->private_lock); - - if (newly_dirty) - __set_page_dirty(page, mapping, 1); - unlock_page_memcg(page); - if (newly_dirty) - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - return newly_dirty; + return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops); } static int @@ -1486,13 +1021,13 @@ const struct address_space_operations xfs_address_space_operations = { .readpages = xfs_vm_readpages, .writepage = xfs_vm_writepage, .writepages = xfs_vm_writepages, - .set_page_dirty = xfs_vm_set_page_dirty, + .set_page_dirty = iomap_set_page_dirty, .releasepage = xfs_vm_releasepage, .invalidatepage = xfs_vm_invalidatepage, .bmap = xfs_vm_bmap, .direct_IO = noop_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, + .migratepage = iomap_migrate_page, + .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = xfs_iomap_swapfile_activate, }; |
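
To make the new writeback path easier to follow, a few of the mechanisms this commit introduces are modeled below as standalone, compilable C. All struct, function, and variable names in these sketches are illustrative stand-ins, not kernel API.

First, completion accounting. The new xfs_finish_page_writeback() replaces the buffer-head walk with a single per-page counter: every block submitted from the page bumps iop->write_count, and end_page_writeback() runs only when the last outstanding block I/O completes. A minimal userspace model of that scheme, assuming C11 atomics:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	/* Per-page state: one counter covering every in-flight block write. */
	struct page_wb {
		atomic_int write_count;
		bool writeback_ended;
	};

	/* One increment per block submitted from the page. */
	static void wb_account_block(struct page_wb *pw)
	{
		atomic_fetch_add(&pw->write_count, 1);
	}

	/* One decrement per completed block; the last completion ends writeback. */
	static void wb_finish_block(struct page_wb *pw)
	{
		/* atomic_fetch_sub() returning 1 is the atomic_dec_and_test() case. */
		if (atomic_fetch_sub(&pw->write_count, 1) == 1)
			pw->writeback_ended = true;	/* i.e. end_page_writeback() */
	}

	int main(void)
	{
		struct page_wb pw = { .writeback_ended = false };

		atomic_init(&pw.write_count, 0);
		wb_account_block(&pw);		/* two blocks of the page... */
		wb_account_block(&pw);

		wb_finish_block(&pw);
		printf("ended after 1st completion: %d\n", pw.writeback_ended); /* 0 */
		wb_finish_block(&pw);
		printf("ended after 2nd completion: %d\n", pw.writeback_ended); /* 1 */
		return 0;
	}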
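
Second, mapping revalidation. xfs_map_blocks() now trusts the cached wpc->imap only while wpc->cow_seq still matches the COW fork's if_seq, and redoes the lookup whenever the fork changed underneath it. The sketch below models that sequence-number revalidation pattern; cached_map and fork_seq are hypothetical stand-ins for wpc->imap and ip->i_cowfp->if_seq:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Cached mapping plus the fork sequence number it was sampled under. */
	struct cached_map {
		uint64_t start;		/* first file block covered */
		uint64_t count;		/* number of blocks covered */
		unsigned seen_seq;	/* fork sequence when the map was cached */
	};

	/* Bumped (under the inode lock) whenever the fork's extent list changes. */
	static _Atomic unsigned fork_seq;

	static bool cached_map_valid(const struct cached_map *map, uint64_t block)
	{
		/* Out of range: a fresh lookup is needed no matter what. */
		if (block < map->start || block >= map->start + map->count)
			return false;
		/* In range, but stale if the fork changed since we cached it. */
		return map->seen_seq == atomic_load(&fork_seq);
	}

	int main(void)
	{
		struct cached_map map = { .start = 100, .count = 8, .seen_seq = 0 };

		printf("block 103 valid: %d\n", cached_map_valid(&map, 103)); /* 1 */
		atomic_fetch_add(&fork_seq, 1);		/* extent list changed */
		printf("block 103 valid: %d\n", cached_map_valid(&map, 103)); /* 0 */
		return 0;
	}

Whenever the check fails, the writeback loop falls back into the lookup path and re-samples the sequence together with the new mapping, so a concurrent change is always caught on the next validation.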
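
Third, the ioend append-or-switch policy. xfs_add_to_ioend() appends a block to the cached ioend only when three things line up: the I/O type, the disk sector (checked against bio_end_sector() of the current bio), and the file offset. Anything else pushes the current ioend onto the submit list and starts a new one. A simplified model of that decision, with end_sector standing in for bio_end_sector():

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	struct ioend {
		int      io_type;	/* overwrite, unwritten, COW, ... */
		uint64_t io_offset;	/* file offset the ioend starts at */
		uint64_t io_size;	/* bytes accumulated so far */
		uint64_t end_sector;	/* disk sector just past the last bio */
	};

	/*
	 * May this block join the ioend being built?  The three conditions
	 * mirror xfs_add_to_ioend(): same I/O type, physically contiguous on
	 * disk, and logically contiguous in the file.
	 */
	static bool can_append(const struct ioend *io, int type,
			       uint64_t sector, uint64_t offset)
	{
		return io != NULL &&
		       io->io_type == type &&
		       sector == io->end_sector &&
		       offset == io->io_offset + io->io_size;
	}

	int main(void)
	{
		struct ioend io = { .io_type = 1, .io_offset = 4096,
				    .io_size = 4096, .end_sector = 72 };

		/* Same type, next sector, next file offset: append. */
		printf("%d\n", can_append(&io, 1, 72, 8192));	/* 1 */
		/* Disk-contiguous but a gap in the file: start a new ioend. */
		printf("%d\n", can_append(&io, 1, 72, 12288));	/* 0 */
		return 0;
	}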
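
Fourth, the discard path. The rewritten xfs_aops_discard_page() punches out every delalloc block under the page with a single xfs_bmap_punch_delalloc_range() call instead of looping over buffer heads; the arithmetic is plain byte-offset-to-block conversion. A worked example, assuming 4 KiB pages over 1 KiB blocks:

	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SIZE 4096u

	int main(void)
	{
		uint32_t blocksize = 1024;	/* sub-page block size */
		uint64_t page_off  = 8192;	/* page_offset(page) */

		/* XFS_B_TO_FSBT(): byte offset -> first file system block. */
		uint64_t start_fsb = page_off / blocksize;
		/* One punch call covers every block under the page. */
		uint32_t nblocks = PAGE_SIZE / blocksize;

		printf("punch %u blocks starting at block %llu\n",
		       nblocks, (unsigned long long)start_fsb);	/* 4 at 8 */
		return 0;
	}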
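
Finally, the per-page walk. xfs_writepage_map() now iterates PAGE_SIZE >> inode->i_blkbits slots and consults the iomap_page uptodate bitmap to skip blocks that were never brought uptodate, mapping and queueing the rest. A compilable sketch of that walk, with a hand-rolled bitmap in place of iop->uptodate:

	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define PAGE_SIZE	(1u << PAGE_SHIFT)

	int main(void)
	{
		unsigned blkbits  = 10;			/* 1 KiB blocks */
		unsigned blocks   = PAGE_SIZE >> blkbits;	/* 4 per page */
		uint32_t uptodate = 0x0B;	/* blocks 0, 1, 3 uptodate */
		uint64_t page_off = 16384, len = 1u << blkbits;

		for (unsigned i = 0; i < blocks; i++) {
			uint64_t file_offset = page_off + (uint64_t)i * len;

			if (!(uptodate & (1u << i)))
				continue;	/* never read in: nothing to write */
			printf("map and write block %u at offset %llu\n",
			       i, (unsigned long long)file_offset);
		}
		return 0;
	}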