From 174e27c607cfa3ebb92934d28c0fdfcf5ce6c3af Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Sat, 25 Mar 2006 03:08:16 -0800 Subject: [PATCH] direct-io: bug fix in dio handling write error There is a bug in direct-io on propagating write error up to the higher I/O layer. When performing an async ODIRECT write to a block device, if a device error occurred (like media error or disk is pulled), the error code is only propagated from device driver to the DIO layer. The error code stops at finished_one_bio(). The aysnc write, however, is supposedly have a corresponding AIO event with appropriate return code (in this case -EIO). Application which waits on the async write event, will hang forever since such AIO event is lost forever (if such app did not use the timeout option in io_getevents call. Regardless, an AIO event is lost). The discovery of above bug leads to another discovery of potential race window with dio->result. The fundamental problem is that dio->result is overloaded with dual use: an indicator of fall back path for partial dio write, and an error indicator used in the I/O completion path. In the event of device error, the setting of -EIO to dio->result clashes with value used to track partial write that activates the fall back path. It was also pointed out that it is impossible to use dio->result to track partial write and at the same time to track error returned from device driver. Because direct_io_work can only determines whether it is a partial write at the end of io submission and in mid stream of those io submission, a return code could be coming back from the driver. Thus messing up all the subsequent logic. Proposed fix is to separating out error code returned by the IO completion path from partial IO submit tracking. A new variable is added to dio structure specifically to track io error returned in the completion path. Signed-off-by: Ken Chen Acked-by: Zach Brown Acked-by: Suparna Bhattacharya Cc: Badari Pulavarty Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/direct-io.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs/direct-io.c') diff --git a/fs/direct-io.c b/fs/direct-io.c index 27f3e787faca..235ed8d1f11e 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -129,6 +129,7 @@ struct dio { /* AIO related stuff */ struct kiocb *iocb; /* kiocb */ int is_async; /* is IO async ? */ + int io_error; /* IO error in completion path */ ssize_t result; /* IO result */ }; @@ -250,6 +251,10 @@ static void finished_one_bio(struct dio *dio) ((offset + transferred) > dio->i_size)) transferred = dio->i_size - offset; + /* check for error in completion path */ + if (dio->io_error) + transferred = dio->io_error; + dio_complete(dio, offset, transferred); /* Complete AIO later if falling back to buffered i/o */ @@ -406,7 +411,7 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) int page_no; if (!uptodate) - dio->result = -EIO; + dio->io_error = -EIO; if (dio->is_async && dio->rw == READ) { bio_check_pages_dirty(bio); /* transfers ownership */ @@ -971,6 +976,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, dio->next_block_for_io = -1; dio->page_errors = 0; + dio->io_error = 0; dio->result = 0; dio->iocb = iocb; dio->i_size = i_size_read(inode); -- cgit v1.2.3 From 1d8fa7a2b9a39d18727acc5c468e870df606c852 Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Sun, 26 Mar 2006 01:38:02 -0800 Subject: [PATCH] remove ->get_blocks() support Now that get_block() can handle mapping multiple disk blocks, no need to have ->get_blocks(). This patch removes fs specific ->get_blocks() added for DIO and makes it users use get_block() instead. Signed-off-by: Badari Pulavarty Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 3 ++- fs/direct-io.c | 27 ++++++++++++++------------- fs/ext2/inode.c | 14 +------------- fs/ext3/inode.c | 12 ++---------- fs/fat/inode.c | 2 +- fs/hfs/inode.c | 13 +------------ fs/hfsplus/inode.c | 13 +------------ fs/jfs/inode.c | 2 +- fs/ocfs2/aops.c | 2 +- fs/reiserfs/inode.c | 1 - fs/xfs/linux-2.6/xfs_aops.c | 6 +++--- include/linux/fs.h | 17 +++++++---------- 12 files changed, 34 insertions(+), 78 deletions(-) (limited to 'fs/direct-io.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index 9a451a9ffad4..5983d42df015 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -131,9 +131,10 @@ blkdev_get_block(struct inode *inode, sector_t iblock, static int blkdev_get_blocks(struct inode *inode, sector_t iblock, - unsigned long max_blocks, struct buffer_head *bh, int create) + struct buffer_head *bh, int create) { sector_t end_block = max_block(I_BDEV(inode)); + unsigned long max_blocks = bh->b_size >> inode->i_blkbits; if ((iblock + max_blocks) > end_block) { max_blocks = end_block - iblock; diff --git a/fs/direct-io.c b/fs/direct-io.c index 235ed8d1f11e..9d1d2aa73e42 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -86,12 +86,12 @@ struct dio { unsigned first_block_in_page; /* doesn't change, Used only once */ int boundary; /* prev block is at a boundary */ int reap_counter; /* rate limit reaping */ - get_blocks_t *get_blocks; /* block mapping function */ + get_block_t *get_block; /* block mapping function */ dio_iodone_t *end_io; /* IO completion function */ sector_t final_block_in_bio; /* current final block in bio + 1 */ sector_t next_block_for_io; /* next block to be put under IO, in dio_blocks units */ - struct buffer_head map_bh; /* last get_blocks() result */ + struct buffer_head map_bh; /* last get_block() result */ /* * Deferred addition of a page to the dio. These variables are @@ -211,9 +211,9 @@ static struct page *dio_get_page(struct dio *dio) /* * Called when all DIO BIO I/O has been completed - let the filesystem - * know, if it registered an interest earlier via get_blocks. Pass the + * know, if it registered an interest earlier via get_block. Pass the * private field of the map buffer_head so that filesystems can use it - * to hold additional state between get_blocks calls and dio_complete. + * to hold additional state between get_block calls and dio_complete. */ static void dio_complete(struct dio *dio, loff_t offset, ssize_t bytes) { @@ -493,7 +493,7 @@ static int dio_bio_reap(struct dio *dio) * The fs is allowed to map lots of blocks at once. If it wants to do that, * it uses the passed inode-relative block number as the file offset, as usual. * - * get_blocks() is passed the number of i_blkbits-sized blocks which direct_io + * get_block() is passed the number of i_blkbits-sized blocks which direct_io * has remaining to do. The fs should not map more than this number of blocks. * * If the fs has mapped a lot of blocks, it should populate bh->b_size to @@ -506,7 +506,7 @@ static int dio_bio_reap(struct dio *dio) * In the case of filesystem holes: the fs may return an arbitrarily-large * hole by returning an appropriate value in b_size and by clearing * buffer_mapped(). However the direct-io code will only process holes one - * block at a time - it will repeatedly call get_blocks() as it walks the hole. + * block at a time - it will repeatedly call get_block() as it walks the hole. */ static int get_more_blocks(struct dio *dio) { @@ -548,7 +548,8 @@ static int get_more_blocks(struct dio *dio) * at a higher level for inside-i_size block-instantiating * writes. */ - ret = (*dio->get_blocks)(dio->inode, fs_startblk, fs_count, + map_bh->b_size = fs_count << dio->blkbits; + ret = (*dio->get_block)(dio->inode, fs_startblk, map_bh, create); } return ret; @@ -783,11 +784,11 @@ static void dio_zero_block(struct dio *dio, int end) * happily perform page-sized but 512-byte aligned IOs. It is important that * blockdev IO be able to have fine alignment and large sizes. * - * So what we do is to permit the ->get_blocks function to populate bh.b_size + * So what we do is to permit the ->get_block function to populate bh.b_size * with the size of IO which is permitted at this offset and this i_blkbits. * * For best results, the blockdev should be set up with 512-byte i_blkbits and - * it should set b_size to PAGE_SIZE or more inside get_blocks(). This gives + * it should set b_size to PAGE_SIZE or more inside get_block(). This gives * fine alignment but still allows this function to work in PAGE_SIZE units. */ static int do_direct_IO(struct dio *dio) @@ -947,7 +948,7 @@ out: static ssize_t direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs, - unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io, + unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, struct dio *dio) { unsigned long user_addr; @@ -969,7 +970,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, dio->boundary = 0; dio->reap_counter = 0; - dio->get_blocks = get_blocks; + dio->get_block = get_block; dio->end_io = end_io; dio->map_bh.b_private = NULL; dio->final_block_in_bio = -1; @@ -1177,7 +1178,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, - unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io, + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, int dio_lock_type) { int seg; @@ -1273,7 +1274,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, (end > i_size_read(inode))); retval = direct_io_worker(rw, iocb, inode, iov, offset, - nr_segs, blkbits, get_blocks, end_io, dio); + nr_segs, blkbits, get_block, end_io, dio); if (rw == READ && dio_lock_type == DIO_LOCKING) release_i_mutex = 0; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index a717837f272e..04af9c45dce2 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -667,18 +667,6 @@ static sector_t ext2_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping,block,ext2_get_block); } -static int -ext2_get_blocks(struct inode *inode, sector_t iblock, unsigned long max_blocks, - struct buffer_head *bh_result, int create) -{ - int ret; - - ret = ext2_get_block(inode, iblock, bh_result, create); - if (ret == 0) - bh_result->b_size = (1 << inode->i_blkbits); - return ret; -} - static ssize_t ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) @@ -687,7 +675,7 @@ ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, struct inode *inode = file->f_mapping->host; return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, ext2_get_blocks, NULL); + offset, nr_segs, ext2_get_block, NULL); } static int diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 34e5b0dc9168..0cd126176bbb 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -940,11 +940,11 @@ out: static int ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock, - unsigned long max_blocks, struct buffer_head *bh_result, int create) { handle_t *handle = journal_current_handle(); int ret = 0; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; if (!create) goto get_block; /* A read */ @@ -989,18 +989,10 @@ get_block: return ret; } -static int ext3_get_blocks(struct inode *inode, sector_t iblock, - unsigned long maxblocks, struct buffer_head *bh_result, - int create) -{ - return ext3_direct_io_get_blocks(inode, iblock, maxblocks, - bh_result, create); -} - static int ext3_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - return ext3_get_blocks(inode, iblock, 1, bh_result, create); + return ext3_direct_io_get_blocks(inode, iblock, bh_result, create); } /* diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 297300fe81c2..404bfc9f7385 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -101,11 +101,11 @@ static int __fat_get_blocks(struct inode *inode, sector_t iblock, } static int fat_get_blocks(struct inode *inode, sector_t iblock, - unsigned long max_blocks, struct buffer_head *bh_result, int create) { struct super_block *sb = inode->i_sb; int err; + unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; err = __fat_get_blocks(inode, iblock, &max_blocks, bh_result, create); if (err) diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 39fd85b9b916..2c564701724f 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -98,17 +98,6 @@ static int hfs_releasepage(struct page *page, gfp_t mask) return res ? try_to_free_buffers(page) : 0; } -static int hfs_get_blocks(struct inode *inode, sector_t iblock, unsigned long max_blocks, - struct buffer_head *bh_result, int create) -{ - int ret; - - ret = hfs_get_block(inode, iblock, bh_result, create); - if (!ret) - bh_result->b_size = (1 << inode->i_blkbits); - return ret; -} - static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { @@ -116,7 +105,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb, struct inode *inode = file->f_dentry->d_inode->i_mapping->host; return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, hfs_get_blocks, NULL); + offset, nr_segs, hfs_get_block, NULL); } static int hfs_writepages(struct address_space *mapping, diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 12ed2b7d046b..9fbe4d2aeece 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -93,17 +93,6 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask) return res ? try_to_free_buffers(page) : 0; } -static int hfsplus_get_blocks(struct inode *inode, sector_t iblock, unsigned long max_blocks, - struct buffer_head *bh_result, int create) -{ - int ret; - - ret = hfsplus_get_block(inode, iblock, bh_result, create); - if (!ret) - bh_result->b_size = (1 << inode->i_blkbits); - return ret; -} - static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { @@ -111,7 +100,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb, struct inode *inode = file->f_dentry->d_inode->i_mapping->host; return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, hfsplus_get_blocks, NULL); + offset, nr_segs, hfsplus_get_block, NULL); } static int hfsplus_writepages(struct address_space *mapping, diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 7239ef339489..04eb78f1252e 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -302,7 +302,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb, struct inode *inode = file->f_mapping->host; return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, jfs_get_blocks, NULL); + offset, nr_segs, jfs_get_block, NULL); } struct address_space_operations jfs_aops = { diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index bf931ba1d364..0d858d0b25be 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -540,7 +540,6 @@ bail: * fs_count, map_bh, dio->rw == WRITE); */ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, - unsigned long max_blocks, struct buffer_head *bh_result, int create) { int ret; @@ -548,6 +547,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, u64 p_blkno; int contig_blocks; unsigned char blocksize_bits; + unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; if (!inode || !bh_result) { mlog(ML_ERROR, "inode or bh_result is null\n"); diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 62e18c19b446..9857e50f85e7 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -466,7 +466,6 @@ static int reiserfs_get_block_create_0(struct inode *inode, sector_t block, direct_IO request. */ static int reiserfs_get_blocks_direct_io(struct inode *inode, sector_t iblock, - unsigned long max_blocks, struct buffer_head *bh_result, int create) { diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index a79b84f8b55c..c02f7c5b7462 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1319,12 +1319,12 @@ STATIC int xfs_get_blocks_direct( struct inode *inode, sector_t iblock, - unsigned long max_blocks, struct buffer_head *bh_result, int create) { - return __xfs_get_block(inode, iblock, max_blocks, bh_result, - create, 1, BMAPI_WRITE|BMAPI_DIRECT); + return __xfs_get_block(inode, iblock, + bh_result->b_size >> inode->i_blkbits, + bh_result, create, 1, BMAPI_WRITE|BMAPI_DIRECT); } STATIC void diff --git a/include/linux/fs.h b/include/linux/fs.h index 155d29d5e5e4..9d9674946956 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -252,9 +252,6 @@ extern void __init files_init(unsigned long); struct buffer_head; typedef int (get_block_t)(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -typedef int (get_blocks_t)(struct inode *inode, sector_t iblock, - unsigned long max_blocks, - struct buffer_head *bh_result, int create); typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, ssize_t bytes, void *private); @@ -1645,7 +1642,7 @@ static inline void do_generic_file_read(struct file * filp, loff_t *ppos, ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, - unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io, + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, int lock_type); enum { @@ -1656,29 +1653,29 @@ enum { static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, - loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks, + loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io) { return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_blocks, end_io, DIO_LOCKING); + nr_segs, get_block, end_io, DIO_LOCKING); } static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, - loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks, + loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io) { return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_blocks, end_io, DIO_NO_LOCKING); + nr_segs, get_block, end_io, DIO_NO_LOCKING); } static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, - loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks, + loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io) { return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_blocks, end_io, DIO_OWN_LOCKING); + nr_segs, get_block, end_io, DIO_OWN_LOCKING); } extern struct file_operations generic_ro_fops; -- cgit v1.2.3 From 3c674e74238cb2484169e3f84f687c66887086b6 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Wed, 29 Mar 2006 09:26:15 +1000 Subject: Fixes a regression from the recent "remove ->get_blocks() support" change. inode->i_blkbits should be used when making a get_block_t request of a filesystem instead of dio->blkbits, as that does not indicate the filesystem block size all the time (depends on request alignment - see start of __blockdev_direct_IO). Signed-off-by: Nathan Scott Acked-by: Badari Pulavarty --- fs/direct-io.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/direct-io.c') diff --git a/fs/direct-io.c b/fs/direct-io.c index 9d1d2aa73e42..910a8ed74b5d 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -524,8 +524,6 @@ static int get_more_blocks(struct dio *dio) */ ret = dio->page_errors; if (ret == 0) { - map_bh->b_state = 0; - map_bh->b_size = 0; BUG_ON(dio->block_in_file >= dio->final_block_in_request); fs_startblk = dio->block_in_file >> dio->blkfactor; dio_count = dio->final_block_in_request - dio->block_in_file; @@ -534,6 +532,9 @@ static int get_more_blocks(struct dio *dio) if (dio_count & blkmask) fs_count++; + map_bh->b_state = 0; + map_bh->b_size = fs_count << dio->inode->i_blkbits; + create = dio->rw == WRITE; if (dio->lock_type == DIO_LOCKING) { if (dio->block_in_file < (i_size_read(dio->inode) >> @@ -542,13 +543,13 @@ static int get_more_blocks(struct dio *dio) } else if (dio->lock_type == DIO_NO_LOCKING) { create = 0; } + /* * For writes inside i_size we forbid block creations: only * overwrites are permitted. We fall back to buffered writes * at a higher level for inside-i_size block-instantiating * writes. */ - map_bh->b_size = fs_count << dio->blkbits; ret = (*dio->get_block)(dio->inode, fs_startblk, map_bh, create); } -- cgit v1.2.3