diff options
Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r-- | fs/btrfs/file.c | 196 |
1 files changed, 157 insertions, 39 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7ff577005d0f..11204dbbe053 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -437,9 +437,15 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, /* * unlocks pages after btrfs_file_write is done with them */ -static void btrfs_drop_pages(struct page **pages, size_t num_pages) +static void btrfs_drop_pages(struct btrfs_fs_info *fs_info, + struct page **pages, size_t num_pages, + u64 pos, u64 copied) { size_t i; + u64 block_start = round_down(pos, fs_info->sectorsize); + u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start; + + ASSERT(block_len <= U32_MAX); for (i = 0; i < num_pages; i++) { /* page checked is some magic around finding pages that * have been modified without going through btrfs_set_page_dirty @@ -447,7 +453,8 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) * accessed as prepare_pages should have marked them accessed * in prepare_pages via find_or_create_page() */ - ClearPageChecked(pages[i]); + btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start, + block_len); unlock_page(pages[i]); put_page(pages[i]); } @@ -504,7 +511,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, struct page *p = pages[i]; btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes); - ClearPageChecked(p); + btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes); btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes); } @@ -734,8 +741,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (args->start >= inode->disk_i_size && !args->replace_extent) modify_tree = 0; - update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || - root == fs_info->tree_root); + update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); while (1) { recow = 0; ret = btrfs_lookup_file_extent(trans, root, path, ino, @@ -870,7 +876,8 @@ next_slot: btrfs_init_data_ref(&ref, root->root_key.objectid, new_key.objectid, - args->start - extent_offset); + args->start - extent_offset, + 0, false); ret = btrfs_inc_extent_ref(trans, &ref); BUG_ON(ret); /* -ENOMEM */ } @@ -956,7 +963,8 @@ delete_extent_item: btrfs_init_data_ref(&ref, root->root_key.objectid, key.objectid, - key.offset - extent_offset); + key.offset - extent_offset, 0, + false); ret = btrfs_free_extent(trans, &ref); BUG_ON(ret); /* -ENOMEM */ args->bytes_found += extent_end - key.offset; @@ -1021,8 +1029,7 @@ delete_extent_item: if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) path->slots[0]++; } - setup_items_for_insert(root, path, &key, - &args->extent_item_size, 1); + btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size); args->extent_inserted = true; } @@ -1233,7 +1240,7 @@ again: btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr, num_bytes, 0); btrfs_init_data_ref(&ref, root->root_key.objectid, ino, - orig_offset); + orig_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1258,7 +1265,8 @@ again: other_end = 0; btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, num_bytes, 0); - btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset); + btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset, + 0, false); if (extent_mergeable(leaf, path->slots[0] + 1, ino, bytenr, orig_offset, &other_start, &other_end)) { @@ -1710,7 +1718,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, * Fault pages before locking them in prepare_pages * to avoid recursive lock */ - if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) { + if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) { ret = -EFAULT; break; } @@ -1845,7 +1853,7 @@ again: btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); if (ret) { - btrfs_drop_pages(pages, num_pages); + btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); break; } @@ -1853,7 +1861,7 @@ again: if (only_release_metadata) btrfs_check_nocow_unlock(BTRFS_I(inode)); - btrfs_drop_pages(pages, num_pages); + btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); cond_resched(); @@ -1904,16 +1912,17 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) { + const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC); struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); loff_t pos; ssize_t written = 0; ssize_t written_buffered; + size_t prev_left = 0; loff_t endbyte; ssize_t err; unsigned int ilock_flags = 0; - struct iomap_dio *dio = NULL; if (iocb->ki_flags & IOCB_NOWAIT) ilock_flags |= BTRFS_ILOCK_TRY; @@ -1956,23 +1965,80 @@ relock: goto buffered; } - dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - 0); + /* + * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw() + * calls generic_write_sync() (through iomap_dio_complete()), because + * that results in calling fsync (btrfs_sync_file()) which will try to + * lock the inode in exclusive/write mode. + */ + if (is_sync_write) + iocb->ki_flags &= ~IOCB_DSYNC; - btrfs_inode_unlock(inode, ilock_flags); + /* + * The iov_iter can be mapped to the same file range we are writing to. + * If that's the case, then we will deadlock in the iomap code, because + * it first calls our callback btrfs_dio_iomap_begin(), which will create + * an ordered extent, and after that it will fault in the pages that the + * iov_iter refers to. During the fault in we end up in the readahead + * pages code (starting at btrfs_readahead()), which will lock the range, + * find that ordered extent and then wait for it to complete (at + * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since + * obviously the ordered extent can never complete as we didn't submit + * yet the respective bio(s). This always happens when the buffer is + * memory mapped to the same file range, since the iomap DIO code always + * invalidates pages in the target file range (after starting and waiting + * for any writeback). + * + * So here we disable page faults in the iov_iter and then retry if we + * got -EFAULT, faulting in the pages before the retry. + */ +again: + from->nofault = true; + err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, written); + from->nofault = false; - if (IS_ERR_OR_NULL(dio)) { - err = PTR_ERR_OR_ZERO(dio); - if (err < 0 && err != -ENOTBLK) - goto out; - } else { - written = iomap_dio_complete(dio); + /* No increment (+=) because iomap returns a cumulative value. */ + if (err > 0) + written = err; + + if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) { + const size_t left = iov_iter_count(from); + /* + * We have more data left to write. Try to fault in as many as + * possible of the remainder pages and retry. We do this without + * releasing and locking again the inode, to prevent races with + * truncate. + * + * Also, in case the iov refers to pages in the file range of the + * file we want to write to (due to a mmap), we could enter an + * infinite loop if we retry after faulting the pages in, since + * iomap will invalidate any pages in the range early on, before + * it tries to fault in the pages of the iov. So we keep track of + * how much was left of iov in the previous EFAULT and fallback + * to buffered IO in case we haven't made any progress. + */ + if (left == prev_left) { + err = -ENOTBLK; + } else { + fault_in_iov_iter_readable(from, left); + prev_left = left; + goto again; + } } - if (written < 0 || !iov_iter_count(from)) { - err = written; + btrfs_inode_unlock(inode, ilock_flags); + + /* + * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do + * the fsync (call generic_write_sync()). + */ + if (is_sync_write) + iocb->ki_flags |= IOCB_DSYNC; + + /* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */ + if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from)) goto out; - } buffered: pos = iocb->ki_pos; @@ -1997,7 +2063,7 @@ buffered: invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT, endbyte >> PAGE_SHIFT); out: - return written ? written : err; + return err < 0 ? err : written; } static ssize_t btrfs_file_write_iter(struct kiocb *iocb, @@ -2013,7 +2079,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * have opened a file as writable, we have to stop this write operation * to ensure consistency. */ - if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state)) + if (BTRFS_FS_ERROR(inode->root->fs_info)) return -EROFS; if (!(iocb->ki_flags & IOCB_DIRECT) && @@ -2621,7 +2687,7 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, extent_info->disk_len, 0); ref_offset = extent_info->file_offset - extent_info->data_offset; btrfs_init_data_ref(&ref, root->root_key.objectid, - btrfs_ino(inode), ref_offset); + btrfs_ino(inode), ref_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); } @@ -2704,14 +2770,16 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, drop_args.bytes_found); if (ret != -ENOSPC) { /* - * When cloning we want to avoid transaction aborts when - * nothing was done and we are attempting to clone parts - * of inline extents, in such cases -EOPNOTSUPP is - * returned by __btrfs_drop_extents() without having - * changed anything in the file. + * The only time we don't want to abort is if we are + * attempting to clone a partial inline extent, in which + * case we'll get EOPNOTSUPP. However if we aren't + * clone we need to abort no matter what, because if we + * got EOPNOTSUPP via prealloc then we messed up and + * need to abort. */ - if (extent_info && !extent_info->is_new_extent && - ret && ret != -EOPNOTSUPP) + if (ret && + (ret != -EOPNOTSUPP || + (extent_info && extent_info->is_new_extent))) btrfs_abort_transaction(trans, ret); break; } @@ -3649,6 +3717,8 @@ static int check_direct_read(struct btrfs_fs_info *fs_info, static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); + size_t prev_left = 0; + ssize_t read = 0; ssize_t ret; if (fsverity_active(inode)) @@ -3658,9 +3728,57 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) return 0; btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); - ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0); +again: + /* + * This is similar to what we do for direct IO writes, see the comment + * at btrfs_direct_write(), but we also disable page faults in addition + * to disabling them only at the iov_iter level. This is because when + * reading from a hole or prealloc extent, iomap calls iov_iter_zero(), + * which can still trigger page fault ins despite having set ->nofault + * to true of our 'to' iov_iter. + * + * The difference to direct IO writes is that we deadlock when trying + * to lock the extent range in the inode's tree during he page reads + * triggered by the fault in (while for writes it is due to waiting for + * our own ordered extent). This is because for direct IO reads, + * btrfs_dio_iomap_begin() returns with the extent range locked, which + * is only unlocked in the endio callback (end_bio_extent_readpage()). + */ + pagefault_disable(); + to->nofault = true; + ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, read); + to->nofault = false; + pagefault_enable(); + + /* No increment (+=) because iomap returns a cumulative value. */ + if (ret > 0) + read = ret; + + if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) { + const size_t left = iov_iter_count(to); + + if (left == prev_left) { + /* + * We didn't make any progress since the last attempt, + * fallback to a buffered read for the remainder of the + * range. This is just to avoid any possibility of looping + * for too long. + */ + ret = read; + } else { + /* + * We made some progress since the last retry or this is + * the first time we are retrying. Fault in as many pages + * as possible and retry. + */ + fault_in_iov_iter_writeable(to, left); + prev_left = left; + goto again; + } + } btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); - return ret; + return ret < 0 ? ret : read; } static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) |