From 705965bd6dfadc3b2e0241da1423ef660bdd04c8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 8 Mar 2016 23:08:10 -0500 Subject: ext4: rename and split get blocks functions Rename ext4_get_blocks_write() to ext4_get_blocks_unwritten() to better describe what it does. Also split out get blocks functions for direct IO. Later we move functionality from _ext4_get_blocks() there. There's no functional change in this patch. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 96 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 60 insertions(+), 36 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9cc57c3b4661..bf545d017210 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -768,6 +768,60 @@ int ext4_get_block(struct inode *inode, sector_t iblock, create ? EXT4_GET_BLOCKS_CREATE : 0); } +/* + * Get block function used when preparing for buffered write if we require + * creating an unwritten extent if blocks haven't been allocated. The extent + * will be converted to written after the IO is complete. + */ +int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n", + inode->i_ino, create); + return _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_IO_CREATE_EXT); +} + +/* Get block function for DIO reads and writes to inodes without extents */ +int ext4_dio_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + return _ext4_get_block(inode, iblock, bh, + create ? EXT4_GET_BLOCKS_CREATE : 0); +} + +/* + * Get block function for DIO writes when we create unwritten extent if + * blocks are not allocated yet. The extent will be converted to written + * after IO is complete. + */ +static int ext4_dio_get_block_unwritten(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + ext4_debug("ext4_dio_get_block_unwritten: inode %lu, create flag %d\n", + inode->i_ino, create); + return _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_IO_CREATE_EXT); +} + +static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret; + + ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n", + inode->i_ino, create); + ret = _ext4_get_block(inode, iblock, bh_result, 0); + /* + * Blocks should have been preallocated! ext4_file_write_iter() checks + * that. + */ + WARN_ON_ONCE(!buffer_mapped(bh_result)); + + return ret; +} + + /* * `handle' can be NULL if create is zero */ @@ -1079,13 +1133,14 @@ retry_journal: #ifdef CONFIG_EXT4_FS_ENCRYPTION if (ext4_should_dioread_nolock(inode)) ret = ext4_block_write_begin(page, pos, len, - ext4_get_block_write); + ext4_get_block_unwritten); else ret = ext4_block_write_begin(page, pos, len, ext4_get_block); #else if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(page, pos, len, ext4_get_block_write); + ret = __block_write_begin(page, pos, len, + ext4_get_block_unwritten); else ret = __block_write_begin(page, pos, len, ext4_get_block); #endif @@ -3084,37 +3139,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait) return try_to_free_buffers(page); } -/* - * ext4_get_block used when preparing for a DIO write or buffer write. - * We allocate an uinitialized extent if blocks haven't been allocated. - * The extent will be converted to initialized after the IO is complete. - */ -int ext4_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", - inode->i_ino, create); - return _ext4_get_block(inode, iblock, bh_result, - EXT4_GET_BLOCKS_IO_CREATE_EXT); -} - -static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - int ret; - - ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n", - inode->i_ino, create); - ret = _ext4_get_block(inode, iblock, bh_result, 0); - /* - * Blocks should have been preallocated! ext4_file_write_iter() checks - * that. - */ - WARN_ON_ONCE(!buffer_mapped(bh_result)); - - return ret; -} - #ifdef CONFIG_FS_DAX int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -3282,7 +3306,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, */ iocb->private = NULL; if (overwrite) { - get_block_func = ext4_get_block_overwrite; + get_block_func = ext4_dio_get_block_overwrite; } else { ext4_inode_aio_set(inode, NULL); if (!is_sync_kiocb(iocb)) { @@ -3304,7 +3328,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, */ ext4_inode_aio_set(inode, io_end); } - get_block_func = ext4_get_block_write; + get_block_func = ext4_dio_get_block_unwritten; dio_flags = DIO_LOCKING; } #ifdef CONFIG_EXT4_FS_ENCRYPTION @@ -5498,7 +5522,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) unlock_page(page); /* OK, we need to fill the hole... */ if (ext4_should_dioread_nolock(inode)) - get_block = ext4_get_block_write; + get_block = ext4_get_block_unwritten; else get_block = ext4_get_block; retry_alloc: -- cgit v1.2.3 From efe70c29511544b0468723fe92c1847b3b0ca046 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 8 Mar 2016 23:35:46 -0500 Subject: ext4: move trans handling and completion deferal out of _ext4_get_block There is no need to handle starting of a transaction and deferal of DIO completion in _ext4_get_block() function. We can move this out to get block functions for direct IO that need it. That way we can add stricter checks verifying things work as we expect. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 91 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 59 insertions(+), 32 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bf545d017210..aea67d906cd8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -714,16 +714,11 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) cmpxchg(&bh->b_state, old_state, new_state) != old_state)); } -/* Maximum number of blocks we map for direct IO at once. */ -#define DIO_MAX_BLOCKS 4096 - static int _ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int flags) { - handle_t *handle = ext4_journal_current_handle(); struct ext4_map_blocks map; - int ret = 0, started = 0; - int dio_credits; + int ret = 0; if (ext4_has_inline_data(inode)) return -ERANGE; @@ -731,33 +726,14 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; - if (flags && !handle) { - /* Direct IO write... */ - if (map.m_len > DIO_MAX_BLOCKS) - map.m_len = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, - dio_credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - return ret; - } - started = 1; - } - - ret = ext4_map_blocks(handle, inode, &map, flags); + ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map, + flags); if (ret > 0) { - ext4_io_end_t *io_end = ext4_inode_aio(inode); - map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); - if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) - set_buffer_defer_completion(bh); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; } - if (started) - ext4_journal_stop(handle); return ret; } @@ -782,12 +758,42 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, EXT4_GET_BLOCKS_IO_CREATE_EXT); } +/* Maximum number of blocks we map for direct IO at once. */ +#define DIO_MAX_BLOCKS 4096 + +static handle_t *start_dio_trans(struct inode *inode, + struct buffer_head *bh_result) +{ + int dio_credits; + + /* Trim mapping request to maximum we can map at once for DIO */ + if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS) + bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits; + dio_credits = ext4_chunk_trans_blocks(inode, + bh_result->b_size >> inode->i_blkbits); + return ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); +} + /* Get block function for DIO reads and writes to inodes without extents */ int ext4_dio_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { - return _ext4_get_block(inode, iblock, bh, - create ? EXT4_GET_BLOCKS_CREATE : 0); + handle_t *handle; + int ret; + + /* We don't expect handle for direct IO */ + WARN_ON_ONCE(ext4_journal_current_handle()); + + if (create) { + handle = start_dio_trans(inode, bh); + if (IS_ERR(handle)) + return PTR_ERR(handle); + } + ret = _ext4_get_block(inode, iblock, bh, + create ? EXT4_GET_BLOCKS_CREATE : 0); + if (create) + ext4_journal_stop(handle); + return ret; } /* @@ -798,10 +804,28 @@ int ext4_dio_get_block(struct inode *inode, sector_t iblock, static int ext4_dio_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { + handle_t *handle; + int ret; + ext4_debug("ext4_dio_get_block_unwritten: inode %lu, create flag %d\n", inode->i_ino, create); - return _ext4_get_block(inode, iblock, bh_result, - EXT4_GET_BLOCKS_IO_CREATE_EXT); + /* We don't expect handle for direct IO */ + WARN_ON_ONCE(ext4_journal_current_handle()); + + handle = start_dio_trans(inode, bh_result); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_IO_CREATE_EXT); + ext4_journal_stop(handle); + if (!ret && buffer_unwritten(bh_result)) { + ext4_io_end_t *io_end = ext4_inode_aio(inode); + + set_buffer_defer_completion(bh_result); + WARN_ON_ONCE(io_end && !(io_end->flag & EXT4_IO_END_UNWRITTEN)); + } + + return ret; } static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock, @@ -811,12 +835,15 @@ static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock, ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n", inode->i_ino, create); + /* We don't expect handle for direct IO */ + WARN_ON_ONCE(ext4_journal_current_handle()); + ret = _ext4_get_block(inode, iblock, bh_result, 0); /* * Blocks should have been preallocated! ext4_file_write_iter() checks * that. */ - WARN_ON_ONCE(!buffer_mapped(bh_result)); + WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result)); return ret; } -- cgit v1.2.3 From 109811c20fb8ec46e2ed01750214a32a9163d164 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 8 Mar 2016 23:36:46 -0500 Subject: ext4: simplify io_end handling for AIO DIO When mapping blocks for direct IO, we allocate io_end structure before mapping blocks and store pointer to it in the inode. This creates a requirement that any AIO DIO using io_end must be protected by i_mutex. This created problems in the past with dioread_nolock mode which was corrupting io_end pointers. Also io_end is allocated unnecessarily in case where we don't need to convert any extents (which is a common case for example when overwriting file). We fix the problem by allocating io_end only once we return unwritten extent from block mapping function for AIO DIO (so we can save some pointless io_end allocations) and we pass pointer to it in bh->b_private which generic DIO code later passes to our end IO callback. That way we remove any need for global pointer to io_end structure and thus fix the races. The downside of this change is that the checking for unwritten IO in flight in ext4_extents_can_be_merged() is more racy since we now increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after dropping i_data_sem. However the check has been racy already before because ext4_writepages() already increment i_unwritten after dropping i_data_sem and reserved blocks save us from hitting ENOSPC in the worst case. Signed-off-by: Jan Kara --- fs/ext4/ext4.h | 10 ---- fs/ext4/extents.c | 35 +++----------- fs/ext4/inode.c | 133 ++++++++++++++++++++++++++++-------------------------- 3 files changed, 74 insertions(+), 104 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d60db18edb4a..9f3156c38b16 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1513,16 +1513,6 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode, } } -static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode) -{ - return inode->i_private; -} - -static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io) -{ - inode->i_private = io; -} - /* * Inode dynamic state flags */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 73a48c1657a1..f1b3c2586a1a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1736,6 +1736,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, */ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; + /* + * The check for IO to unwritten extent is somewhat racy as we + * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after + * dropping i_data_sem. But reserved blocks should save us in that + * case. + */ if (ext4_ext_is_unwritten(ex1) && (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || atomic_read(&EXT4_I(inode)->i_unwritten) || @@ -4007,7 +4013,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_ext_path *path = *ppath; int ret = 0; int err = 0; - ext4_io_end_t *io = ext4_inode_aio(inode); ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical " "block %llu, max_blocks %u, flags %x, allocated %u\n", @@ -4030,15 +4035,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, flags | EXT4_GET_BLOCKS_CONVERT); if (ret <= 0) goto out; - /* - * Flag the inode(non aio case) or end_io struct (aio case) - * that this IO needs to conversion to written when IO is - * completed - */ - if (io) - ext4_set_io_unwritten_flag(inode, io); - else - ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); map->m_flags |= EXT4_MAP_UNWRITTEN; goto out; } @@ -4283,9 +4279,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; - ext4_io_end_t *io = ext4_inode_aio(inode); ext4_lblk_t cluster_offset; - int set_unwritten = 0; bool map_from_cluster = false; ext_debug("blocks %u/%u requested for inode %lu\n", @@ -4482,15 +4476,6 @@ got_allocated_blocks: if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){ ext4_ext_mark_unwritten(&newex); map->m_flags |= EXT4_MAP_UNWRITTEN; - /* - * io_end structure was created for every IO write to an - * unwritten extent. To avoid unnecessary conversion, - * here we flag the IO that really needs the conversion. - * For non asycn direct IO case, flag the inode state - * that we need to perform conversion when IO is done. - */ - if (flags & EXT4_GET_BLOCKS_PRE_IO) - set_unwritten = 1; } err = 0; @@ -4501,14 +4486,6 @@ got_allocated_blocks: err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags); - if (!err && set_unwritten) { - if (io) - ext4_set_io_unwritten_flag(inode, io); - else - ext4_set_inode_state(inode, - EXT4_STATE_DIO_UNWRITTEN); - } - if (err && free_on_err) { int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index aea67d906cd8..c67c16e59a71 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -797,18 +797,16 @@ int ext4_dio_get_block(struct inode *inode, sector_t iblock, } /* - * Get block function for DIO writes when we create unwritten extent if + * Get block function for AIO DIO writes when we create unwritten extent if * blocks are not allocated yet. The extent will be converted to written * after IO is complete. */ -static int ext4_dio_get_block_unwritten(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int ext4_dio_get_block_unwritten_async(struct inode *inode, + sector_t iblock, struct buffer_head *bh_result, int create) { handle_t *handle; int ret; - ext4_debug("ext4_dio_get_block_unwritten: inode %lu, create flag %d\n", - inode->i_ino, create); /* We don't expect handle for direct IO */ WARN_ON_ONCE(ext4_journal_current_handle()); @@ -818,16 +816,62 @@ static int ext4_dio_get_block_unwritten(struct inode *inode, sector_t iblock, ret = _ext4_get_block(inode, iblock, bh_result, EXT4_GET_BLOCKS_IO_CREATE_EXT); ext4_journal_stop(handle); - if (!ret && buffer_unwritten(bh_result)) { - ext4_io_end_t *io_end = ext4_inode_aio(inode); + /* + * When doing DIO using unwritten extents, we need io_end to convert + * unwritten extents to written on IO completion. We allocate io_end + * once we spot unwritten extent and store it in b_private. Generic + * DIO code keeps b_private set and furthermore passes the value to + * our completion callback in 'private' argument. + */ + if (!ret && buffer_unwritten(bh_result)) { + if (!bh_result->b_private) { + ext4_io_end_t *io_end; + + io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!io_end) + return -ENOMEM; + bh_result->b_private = io_end; + ext4_set_io_unwritten_flag(inode, io_end); + } set_buffer_defer_completion(bh_result); - WARN_ON_ONCE(io_end && !(io_end->flag & EXT4_IO_END_UNWRITTEN)); } return ret; } +/* + * Get block function for non-AIO DIO writes when we create unwritten extent if + * blocks are not allocated yet. The extent will be converted to written + * after IO is complete from ext4_ext_direct_IO() function. + */ +static int ext4_dio_get_block_unwritten_sync(struct inode *inode, + sector_t iblock, struct buffer_head *bh_result, int create) +{ + handle_t *handle; + int ret; + + /* We don't expect handle for direct IO */ + WARN_ON_ONCE(ext4_journal_current_handle()); + + handle = start_dio_trans(inode, bh_result); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_IO_CREATE_EXT); + ext4_journal_stop(handle); + + /* + * Mark inode as having pending DIO writes to unwritten extents. + * ext4_ext_direct_IO() checks this flag and converts extents to + * written. + */ + if (!ret && buffer_unwritten(bh_result)) + ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + + return ret; +} + static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -3243,7 +3287,7 @@ out: static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private) { - ext4_io_end_t *io_end = iocb->private; + ext4_io_end_t *io_end = private; /* if not async direct IO just return */ if (!io_end) @@ -3251,10 +3295,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ext_debug("ext4_end_io_dio(): io_end 0x%p " "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", - iocb->private, io_end->inode->i_ino, iocb, offset, - size); + io_end, io_end->inode->i_ino, iocb, offset, size); - iocb->private = NULL; io_end->offset = offset; io_end->size = size; ext4_put_io_end(io_end); @@ -3290,7 +3332,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, get_block_t *get_block_func = NULL; int dio_flags = 0; loff_t final_size = offset + count; - ext4_io_end_t *io_end = NULL; /* Use the old path for reads and writes beyond i_size. */ if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size) @@ -3315,16 +3356,17 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, /* * We could direct write to holes and fallocate. * - * Allocated blocks to fill the hole are marked as - * unwritten to prevent parallel buffered read to expose - * the stale data before DIO complete the data IO. + * Allocated blocks to fill the hole are marked as unwritten to prevent + * parallel buffered read to expose the stale data before DIO complete + * the data IO. * - * As to previously fallocated extents, ext4 get_block will - * just simply mark the buffer mapped but still keep the - * extents unwritten. + * As to previously fallocated extents, ext4 get_block will just simply + * mark the buffer mapped but still keep the extents unwritten. * - * For non AIO case, we will convert those unwritten extents - * to written after return back from blockdev_direct_IO. + * For non AIO case, we will convert those unwritten extents to written + * after return back from blockdev_direct_IO. That way we save us from + * allocating io_end structure and also the overhead of offloading + * the extent convertion to a workqueue. * * For async DIO, the conversion needs to be deferred when the * IO is completed. The ext4 end_io callback function will be @@ -3332,30 +3374,13 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * case, we allocate an io_end structure to hook to the iocb. */ iocb->private = NULL; - if (overwrite) { + if (overwrite) get_block_func = ext4_dio_get_block_overwrite; + else if (is_sync_kiocb(iocb)) { + get_block_func = ext4_dio_get_block_unwritten_sync; + dio_flags = DIO_LOCKING; } else { - ext4_inode_aio_set(inode, NULL); - if (!is_sync_kiocb(iocb)) { - io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_end) { - ret = -ENOMEM; - goto retake_lock; - } - /* - * Grab reference for DIO. Will be dropped in - * ext4_end_io_dio() - */ - iocb->private = ext4_get_io_end(io_end); - /* - * we save the io structure for current async direct - * IO, so that later ext4_map_blocks() could flag the - * io structure whether there is a unwritten extents - * needs to be converted when IO is completed. - */ - ext4_inode_aio_set(inode, io_end); - } - get_block_func = ext4_dio_get_block_unwritten; + get_block_func = ext4_dio_get_block_unwritten_async; dio_flags = DIO_LOCKING; } #ifdef CONFIG_EXT4_FS_ENCRYPTION @@ -3370,27 +3395,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, get_block_func, ext4_end_io_dio, NULL, dio_flags); - /* - * Put our reference to io_end. This can free the io_end structure e.g. - * in sync IO case or in case of error. It can even perform extent - * conversion if all bios we submitted finished before we got here. - * Note that in that case iocb->private can be already set to NULL - * here. - */ - if (io_end) { - ext4_inode_aio_set(inode, NULL); - ext4_put_io_end(io_end); - /* - * When no IO was submitted ext4_end_io_dio() was not - * called so we have to put iocb's reference. - */ - if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) { - WARN_ON(iocb->private != io_end); - WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); - ext4_put_io_end(io_end); - iocb->private = NULL; - } - } if (ret > 0 && !overwrite && ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN)) { int err; @@ -3405,7 +3409,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); } -retake_lock: if (iov_iter_rw(iter) == WRITE) inode_dio_end(inode); /* take i_mutex locking again if we do a ovewrite dio */ -- cgit v1.2.3 From 600be30a8bc1d405f791e01dbef84679e14529b8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 8 Mar 2016 23:39:21 -0500 Subject: ext4: remove i_ioend_count Remove counter of pending io ends as it is unused. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 7 +------ fs/ext4/inode.c | 3 --- fs/ext4/page-io.c | 4 ---- fs/ext4/super.c | 1 - 4 files changed, 1 insertion(+), 14 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9f3156c38b16..70b8e0409566 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1024,13 +1024,8 @@ struct ext4_inode_info { * transaction reserved */ struct list_head i_rsv_conversion_list; - /* - * Completed IOs that need unwritten extents handling and don't have - * transaction reserved - */ - atomic_t i_ioend_count; /* Number of outstanding io_end structs */ - atomic_t i_unwritten; /* Nr. of inflight conversions pending */ struct work_struct i_rsv_conversion_work; + atomic_t i_unwritten; /* Nr. of inflight conversions pending */ spinlock_t i_block_reservation_lock; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c67c16e59a71..971892d7c213 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -216,7 +216,6 @@ void ext4_evict_inode(struct inode *inode) } truncate_inode_pages_final(&inode->i_data); - WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); goto no_delete; } @@ -228,8 +227,6 @@ void ext4_evict_inode(struct inode *inode) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages_final(&inode->i_data); - WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); - /* * Protect us against freezing - iput() caller didn't have to have any * protection against it diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 090b3498638e..349d7aa04fe7 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -128,9 +128,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); WARN_ON(io_end->handle); - if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) - wake_up_all(ext4_ioend_wq(io_end->inode)); - for (bio = io_end->bio; bio; bio = next_bio) { next_bio = bio->bi_private; ext4_finish_bio(bio); @@ -265,7 +262,6 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) { ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); if (io) { - atomic_inc(&EXT4_I(inode)->i_ioend_count); io->inode = inode; INIT_LIST_HEAD(&io->list); atomic_set(&io->count, 1); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4d5756024e98..de02a9ef45dd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -942,7 +942,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; - atomic_set(&ei->i_ioend_count, 0); atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); #ifdef CONFIG_EXT4_FS_ENCRYPTION -- cgit v1.2.3 From facab4d9711e7aa3532cb82643803e8f1b9518e8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 9 Mar 2016 22:54:00 -0500 Subject: ext4: return hole from ext4_map_blocks() Currently, ext4_map_blocks() just returns 0 when it finds a hole and allocation is not requested. However we have all the information available to tell how large the hole actually is and there are callers of ext4_map_blocks() which would save some block-by-block hole iteration if they knew this information. So fill in struct ext4_map_blocks even for holes with the information we have. We keep returning 0 for holes to maintain backward compatibility of the function. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 11 +++++++++-- fs/ext4/indirect.c | 19 +++++++++++++++++-- fs/ext4/inode.c | 15 ++++++++++----- 3 files changed, 36 insertions(+), 9 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7858fc1d2501..2730039d4742 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4374,13 +4374,20 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { ext4_lblk_t hole_start, hole_len; + hole_start = map->m_lblk; + hole_len = ext4_ext_determine_hole(inode, path, &hole_start); /* * put just found gap into cache to speed up * subsequent requests */ - hole_start = map->m_lblk; - hole_len = ext4_ext_determine_hole(inode, path, &hole_start); ext4_ext_put_gap_in_cache(inode, hole_start, hole_len); + + /* Update hole_len to reflect hole size after map->m_lblk */ + if (hole_start != map->m_lblk) + hole_len -= map->m_lblk - hole_start; + map->m_pblk = 0; + map->m_len = min_t(unsigned int, map->m_len, hole_len); + goto out2; } diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 1655ff195e0e..3027fa681de5 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -555,8 +555,23 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, goto got_it; } - /* Next simple case - plain lookup or failed read of indirect block */ - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) + /* Next simple case - plain lookup failed */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + unsigned epb = inode->i_sb->s_blocksize / sizeof(u32); + int i; + + /* Count number blocks in a subtree under 'partial' */ + count = 1; + for (i = 0; partial + i != chain + depth - 1; i++) + count *= epb; + /* Fill in size of a hole we found */ + map->m_pblk = 0; + map->m_len = min_t(unsigned int, map->m_len, count); + goto cleanup; + } + + /* Failed read of indirect block */ + if (err == -EIO) goto cleanup; /* diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 971892d7c213..1a156ec043a0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -455,13 +455,13 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * - * On success, it returns the number of blocks being mapped or allocated. - * if create==0 and the blocks are pre-allocated and unwritten block, - * the result buffer head is unmapped. If the create ==1, it will make sure - * the buffer head is mapped. + * On success, it returns the number of blocks being mapped or allocated. if + * create==0 and the blocks are pre-allocated and unwritten, the resulting @map + * is marked as unwritten. If the create == 1, it will mark @map as mapped. * * It returns 0 if plain look up failed (blocks have not been allocated), in - * that case, buffer head is unmapped + * that case, @map is returned as unmapped but we still do fill map->m_len to + * indicate the length of a hole starting at map->m_lblk. * * It returns the error in case of allocation failure. */ @@ -504,6 +504,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, retval = map->m_len; map->m_len = retval; } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { + map->m_pblk = 0; + retval = es.es_len - (map->m_lblk - es.es_lblk); + if (retval > map->m_len) + retval = map->m_len; + map->m_len = retval; retval = 0; } else { BUG_ON(1); -- cgit v1.2.3 From e3fb8eb14eafd2847c04cf48b52a705c36f4db98 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 9 Mar 2016 23:03:27 -0500 Subject: ext4: cleanup handling of bh->b_state in DAX mmap ext4_dax_mmap_get_block() updates bh->b_state directly instead of using ext4_update_bh_state(). This is mostly a cosmetic issue since DAX code always passes on-stack buffer_head but clean this up to make code more uniform. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1a156ec043a0..fddc6ddc53a8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3272,13 +3272,12 @@ out: WARN_ON_ONCE(ret == 0 && create); if (ret > 0) { map_bh(bh_result, inode->i_sb, map.m_pblk); - bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) | - map.m_flags; /* * At least for now we have to clear BH_New so that DAX code * doesn't attempt to zero blocks again in a racy way. */ - bh_result->b_state &= ~(1 << BH_New); + map.m_flags &= ~EXT4_MAP_NEW; + ext4_update_bh_state(bh_result, map.m_flags); bh_result->b_size = map.m_len << inode->i_blkbits; ret = 0; } -- cgit v1.2.3 From 2d90c160e5f1d784e180f1e1458d56eee4d7f4f4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 9 Mar 2016 23:11:13 -0500 Subject: ext4: more efficient SEEK_DATA implementation Using SEEK_DATA in a huge sparse file can easily lead to sotflockups as ext4_seek_data() iterates hole block-by-block. Fix the problem by using returned hole size from ext4_map_blocks() and thus skip the hole in one go. Update also SEEK_HOLE implementation to follow the same pattern as SEEK_DATA to make future maintenance easier. Furthermore we add cond_resched() to both ext4_seek_data() and ext4_seek_hole() to avoid softlockups in case evil user creates huge fragmented file and we have to go through lots of extents. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 ++ fs/ext4/file.c | 97 +++++++++++++++++++++------------------------------------ fs/ext4/inode.c | 67 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 61 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 70b8e0409566..5623eec7fd22 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2546,6 +2546,9 @@ extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len); +extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, + unsigned int map_len, + struct extent_status *result); /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 4a1153561580..e93a7efaf78f 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -426,7 +426,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) */ static int ext4_find_unwritten_pgoff(struct inode *inode, int whence, - struct ext4_map_blocks *map, + ext4_lblk_t end_blk, loff_t *offset) { struct pagevec pvec; @@ -441,7 +441,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, blkbits = inode->i_sb->s_blocksize_bits; startoff = *offset; lastoff = startoff; - endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; + endoff = (loff_t)end_blk << blkbits; index = startoff >> PAGE_CACHE_SHIFT; end = endoff >> PAGE_CACHE_SHIFT; @@ -559,12 +559,11 @@ out: static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) { struct inode *inode = file->f_mapping->host; - struct ext4_map_blocks map; struct extent_status es; ext4_lblk_t start, last, end; loff_t dataoff, isize; int blkbits; - int ret = 0; + int ret; inode_lock(inode); @@ -581,41 +580,32 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) dataoff = offset; do { - map.m_lblk = last; - map.m_len = end - last + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { - if (last != start) - dataoff = (loff_t)last << blkbits; - break; + ret = ext4_get_next_extent(inode, last, end - last + 1, &es); + if (ret <= 0) { + /* No extent found -> no data */ + if (ret == 0) + ret = -ENXIO; + inode_unlock(inode); + return ret; } - /* - * If there is a delay extent at this offset, - * it will be as a data. - */ - ext4_es_find_delayed_extent_range(inode, last, last, &es); - if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { - if (last != start) - dataoff = (loff_t)last << blkbits; + last = es.es_lblk; + if (last != start) + dataoff = (loff_t)last << blkbits; + if (!ext4_es_is_unwritten(&es)) break; - } /* * If there is a unwritten extent at this offset, * it will be as a data or a hole according to page * cache that has data or not. */ - if (map.m_flags & EXT4_MAP_UNWRITTEN) { - int unwritten; - unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, - &map, &dataoff); - if (unwritten) - break; - } - - last++; + if (ext4_find_unwritten_pgoff(inode, SEEK_DATA, + es.es_lblk + es.es_len, &dataoff)) + break; + last += es.es_len; dataoff = (loff_t)last << blkbits; + cond_resched(); } while (last <= end); inode_unlock(inode); @@ -632,12 +622,11 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) { struct inode *inode = file->f_mapping->host; - struct ext4_map_blocks map; struct extent_status es; ext4_lblk_t start, last, end; loff_t holeoff, isize; int blkbits; - int ret = 0; + int ret; inode_lock(inode); @@ -654,44 +643,30 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) holeoff = offset; do { - map.m_lblk = last; - map.m_len = end - last + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { - last += ret; - holeoff = (loff_t)last << blkbits; - continue; + ret = ext4_get_next_extent(inode, last, end - last + 1, &es); + if (ret < 0) { + inode_unlock(inode); + return ret; } - - /* - * If there is a delay extent at this offset, - * we will skip this extent. - */ - ext4_es_find_delayed_extent_range(inode, last, last, &es); - if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { - last = es.es_lblk + es.es_len; - holeoff = (loff_t)last << blkbits; - continue; + /* Found a hole? */ + if (ret == 0 || es.es_lblk > last) { + if (last != start) + holeoff = (loff_t)last << blkbits; + break; } - /* * If there is a unwritten extent at this offset, * it will be as a data or a hole according to page * cache that has data or not. */ - if (map.m_flags & EXT4_MAP_UNWRITTEN) { - int unwritten; - unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, - &map, &holeoff); - if (!unwritten) { - last += ret; - holeoff = (loff_t)last << blkbits; - continue; - } - } + if (ext4_es_is_unwritten(&es) && + ext4_find_unwritten_pgoff(inode, SEEK_HOLE, + last + es.es_len, &holeoff)) + break; - /* find a hole */ - break; + last += es.es_len; + holeoff = (loff_t)last << blkbits; + cond_resched(); } while (last <= end); inode_unlock(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fddc6ddc53a8..ce2c4c62386f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5596,3 +5596,70 @@ int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return err; } + +/* + * Find the first extent at or after @lblk in an inode that is not a hole. + * Search for @map_len blocks at most. The extent is returned in @result. + * + * The function returns 1 if we found an extent. The function returns 0 in + * case there is no extent at or after @lblk and in that case also sets + * @result->es_len to 0. In case of error, the error code is returned. + */ +int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, + unsigned int map_len, struct extent_status *result) +{ + struct ext4_map_blocks map; + struct extent_status es = {}; + int ret; + + map.m_lblk = lblk; + map.m_len = map_len; + + /* + * For non-extent based files this loop may iterate several times since + * we do not determine full hole size. + */ + while (map.m_len > 0) { + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; + /* There's extent covering m_lblk? Just return it. */ + if (ret > 0) { + int status; + + ext4_es_store_pblock(result, map.m_pblk); + result->es_lblk = map.m_lblk; + result->es_len = map.m_len; + if (map.m_flags & EXT4_MAP_UNWRITTEN) + status = EXTENT_STATUS_UNWRITTEN; + else + status = EXTENT_STATUS_WRITTEN; + ext4_es_store_status(result, status); + return 1; + } + ext4_es_find_delayed_extent_range(inode, map.m_lblk, + map.m_lblk + map.m_len - 1, + &es); + /* Is delalloc data before next block in extent tree? */ + if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) { + ext4_lblk_t offset = 0; + + if (es.es_lblk < lblk) + offset = lblk - es.es_lblk; + result->es_lblk = es.es_lblk + offset; + ext4_es_store_pblock(result, + ext4_es_pblock(&es) + offset); + result->es_len = es.es_len - offset; + ext4_es_store_status(result, ext4_es_status(&es)); + + return 1; + } + /* There's a hole at m_lblk, advance us after it */ + map.m_lblk += map.m_len; + map_len -= map.m_len; + map.m_len = map_len; + cond_resched(); + } + result->es_len = 0; + return 0; +} -- cgit v1.2.3 From 5e1021f2b6dff1a86a468a1424d59faae2bc63c1 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Sat, 12 Mar 2016 21:40:32 -0500 Subject: ext4: fix NULL pointer dereference in ext4_mark_inode_dirty() ext4_reserve_inode_write() in ext4_mark_inode_dirty() could fail on error (e.g. EIO) and iloc.bh can be NULL in this case. But the error is ignored in the following "if" condition and ext4_expand_extra_isize() might be called with NULL iloc.bh set, which triggers NULL pointer dereference. This is uncovered by commit 8b4953e13f4c ("ext4: reserve code points for the project quota feature"), which enlarges the ext4_inode size, and run the following script on new kernel but with old mke2fs: #/bin/bash mnt=/mnt/ext4 devname=ext4-error dev=/dev/mapper/$devname fsimg=/home/fs.img trap cleanup 0 1 2 3 9 15 cleanup() { umount $mnt >/dev/null 2>&1 dmsetup remove $devname losetup -d $backend_dev rm -f $fsimg exit 0 } rm -f $fsimg fallocate -l 1g $fsimg backend_dev=`losetup -f --show $fsimg` devsize=`blockdev --getsz $backend_dev` good_tab="0 $devsize linear $backend_dev 0" error_tab="0 $devsize error $backend_dev 0" dmsetup create $devname --table "$good_tab" mkfs -t ext4 $dev mount -t ext4 -o errors=continue,strictatime $dev $mnt dmsetup load $devname --table "$error_tab" && dmsetup resume $devname echo 3 > /proc/sys/vm/drop_caches ls -l $mnt exit 0 [ Patch changed to simplify the function a tiny bit. -- Ted ] Signed-off-by: Eryu Guan Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ce2c4c62386f..b41432efcc09 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5312,6 +5312,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) might_sleep(); trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + return err; if (ext4_handle_valid(handle) && EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { @@ -5342,9 +5344,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) } } } - if (!err) - err = ext4_mark_iloc_dirty(handle, inode, &iloc); - return err; + return ext4_mark_iloc_dirty(handle, inode, &iloc); } /* -- cgit v1.2.3