diff options
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- | fs/btrfs/async-thread.c | 14 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 14 | ||||
-rw-r--r-- | fs/btrfs/file.c | 139 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 4 | ||||
-rw-r--r-- | fs/btrfs/lzo.c | 13 | ||||
-rw-r--r-- | fs/btrfs/scrub.c | 4 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 18 | ||||
-rw-r--r-- | fs/btrfs/zstd.c | 68 |
8 files changed, 215 insertions, 59 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 309516e6a968..43c89952b7d2 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -234,6 +234,13 @@ static void run_ordered_work(struct __btrfs_workqueue *wq, ordered_list); if (!test_bit(WORK_DONE_BIT, &work->flags)) break; + /* + * Orders all subsequent loads after reading WORK_DONE_BIT, + * paired with the smp_mb__before_atomic in btrfs_work_helper + * this guarantees that the ordered function will see all + * updates from ordinary work function. + */ + smp_rmb(); /* * we are going to call the ordered done function, but @@ -317,6 +324,13 @@ static void btrfs_work_helper(struct work_struct *normal_work) thresh_exec_hook(wq); work->func(work); if (need_order) { + /* + * Ensures all memory accesses done in the work function are + * ordered before setting the WORK_DONE_BIT. Ensuring the thread + * which is going to executed the ordered work sees them. + * Pairs with the smp_rmb in run_ordered_work. + */ + smp_mb__before_atomic(); set_bit(WORK_DONE_BIT, &work->flags); run_ordered_work(wq, work); } else { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 59c3be8c1f4c..514ead6e93b6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3978,11 +3978,23 @@ static void btrfs_end_empty_barrier(struct bio *bio) */ static void write_dev_flush(struct btrfs_device *device) { - struct request_queue *q = bdev_get_queue(device->bdev); struct bio *bio = device->flush_bio; +#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY + /* + * When a disk has write caching disabled, we skip submission of a bio + * with flush and sync requests before writing the superblock, since + * it's not needed. However when the integrity checker is enabled, this + * results in reports that there are metadata blocks referred by a + * superblock that were not properly flushed. So don't skip the bio + * submission only when the integrity checker is enabled for the sake + * of simplicity, since this is a debug tool and not meant for use in + * non-debug builds. + */ + struct request_queue *q = bdev_get_queue(device->bdev); if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) return; +#endif bio_reset(bio); bio->bi_end_io = btrfs_end_empty_barrier; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 581662d16b72..11204dbbe053 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1912,16 +1912,17 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) { + const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC); struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); loff_t pos; ssize_t written = 0; ssize_t written_buffered; + size_t prev_left = 0; loff_t endbyte; ssize_t err; unsigned int ilock_flags = 0; - struct iomap_dio *dio = NULL; if (iocb->ki_flags & IOCB_NOWAIT) ilock_flags |= BTRFS_ILOCK_TRY; @@ -1964,23 +1965,80 @@ relock: goto buffered; } - dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - 0, 0); + /* + * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw() + * calls generic_write_sync() (through iomap_dio_complete()), because + * that results in calling fsync (btrfs_sync_file()) which will try to + * lock the inode in exclusive/write mode. + */ + if (is_sync_write) + iocb->ki_flags &= ~IOCB_DSYNC; - btrfs_inode_unlock(inode, ilock_flags); + /* + * The iov_iter can be mapped to the same file range we are writing to. + * If that's the case, then we will deadlock in the iomap code, because + * it first calls our callback btrfs_dio_iomap_begin(), which will create + * an ordered extent, and after that it will fault in the pages that the + * iov_iter refers to. During the fault in we end up in the readahead + * pages code (starting at btrfs_readahead()), which will lock the range, + * find that ordered extent and then wait for it to complete (at + * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since + * obviously the ordered extent can never complete as we didn't submit + * yet the respective bio(s). This always happens when the buffer is + * memory mapped to the same file range, since the iomap DIO code always + * invalidates pages in the target file range (after starting and waiting + * for any writeback). + * + * So here we disable page faults in the iov_iter and then retry if we + * got -EFAULT, faulting in the pages before the retry. + */ +again: + from->nofault = true; + err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, written); + from->nofault = false; - if (IS_ERR_OR_NULL(dio)) { - err = PTR_ERR_OR_ZERO(dio); - if (err < 0 && err != -ENOTBLK) - goto out; - } else { - written = iomap_dio_complete(dio); + /* No increment (+=) because iomap returns a cumulative value. */ + if (err > 0) + written = err; + + if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) { + const size_t left = iov_iter_count(from); + /* + * We have more data left to write. Try to fault in as many as + * possible of the remainder pages and retry. We do this without + * releasing and locking again the inode, to prevent races with + * truncate. + * + * Also, in case the iov refers to pages in the file range of the + * file we want to write to (due to a mmap), we could enter an + * infinite loop if we retry after faulting the pages in, since + * iomap will invalidate any pages in the range early on, before + * it tries to fault in the pages of the iov. So we keep track of + * how much was left of iov in the previous EFAULT and fallback + * to buffered IO in case we haven't made any progress. + */ + if (left == prev_left) { + err = -ENOTBLK; + } else { + fault_in_iov_iter_readable(from, left); + prev_left = left; + goto again; + } } - if (written < 0 || !iov_iter_count(from)) { - err = written; + btrfs_inode_unlock(inode, ilock_flags); + + /* + * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do + * the fsync (call generic_write_sync()). + */ + if (is_sync_write) + iocb->ki_flags |= IOCB_DSYNC; + + /* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */ + if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from)) goto out; - } buffered: pos = iocb->ki_pos; @@ -2005,7 +2063,7 @@ buffered: invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT, endbyte >> PAGE_SHIFT); out: - return written ? written : err; + return err < 0 ? err : written; } static ssize_t btrfs_file_write_iter(struct kiocb *iocb, @@ -3659,6 +3717,8 @@ static int check_direct_read(struct btrfs_fs_info *fs_info, static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); + size_t prev_left = 0; + ssize_t read = 0; ssize_t ret; if (fsverity_active(inode)) @@ -3668,10 +3728,57 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) return 0; btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); +again: + /* + * This is similar to what we do for direct IO writes, see the comment + * at btrfs_direct_write(), but we also disable page faults in addition + * to disabling them only at the iov_iter level. This is because when + * reading from a hole or prealloc extent, iomap calls iov_iter_zero(), + * which can still trigger page fault ins despite having set ->nofault + * to true of our 'to' iov_iter. + * + * The difference to direct IO writes is that we deadlock when trying + * to lock the extent range in the inode's tree during he page reads + * triggered by the fault in (while for writes it is due to waiting for + * our own ordered extent). This is because for direct IO reads, + * btrfs_dio_iomap_begin() returns with the extent range locked, which + * is only unlocked in the endio callback (end_bio_extent_readpage()). + */ + pagefault_disable(); + to->nofault = true; ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - 0, 0); + IOMAP_DIO_PARTIAL, read); + to->nofault = false; + pagefault_enable(); + + /* No increment (+=) because iomap returns a cumulative value. */ + if (ret > 0) + read = ret; + + if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) { + const size_t left = iov_iter_count(to); + + if (left == prev_left) { + /* + * We didn't make any progress since the last attempt, + * fallback to a buffered read for the remainder of the + * range. This is just to avoid any possibility of looping + * for too long. + */ + ret = read; + } else { + /* + * We made some progress since the last retry or this is + * the first time we are retrying. Fault in as many pages + * as possible and retry. + */ + fault_in_iov_iter_writeable(to, left); + prev_left = left; + goto again; + } + } btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); - return ret; + return ret < 0 ? ret : read; } static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fb8cc9642ac4..92138ac2a4e2 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3985,6 +3985,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) bool need_unlock; /* for mut. excl. ops lock */ int ret; + if (!arg) + btrfs_warn(fs_info, + "IOC_BALANCE ioctl (v1) is deprecated and will be removed in kernel 5.18"); + if (!capable(CAP_SYS_ADMIN)) return -EPERM; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 65cb0766e62d..9febb8025825 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -125,6 +125,7 @@ static inline size_t read_compress_length(const char *buf) static int copy_compressed_data_to_page(char *compressed_data, size_t compressed_size, struct page **out_pages, + unsigned long max_nr_page, u32 *cur_out, const u32 sectorsize) { @@ -133,6 +134,9 @@ static int copy_compressed_data_to_page(char *compressed_data, struct page *cur_page; char *kaddr; + if ((*cur_out / PAGE_SIZE) >= max_nr_page) + return -E2BIG; + /* * We never allow a segment header crossing sector boundary, previous * run should ensure we have enough space left inside the sector. @@ -161,6 +165,10 @@ static int copy_compressed_data_to_page(char *compressed_data, orig_out + compressed_size - *cur_out); kunmap(cur_page); + + if ((*cur_out / PAGE_SIZE) >= max_nr_page) + return -E2BIG; + cur_page = out_pages[*cur_out / PAGE_SIZE]; /* Allocate a new page */ if (!cur_page) { @@ -203,6 +211,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize; struct page *page_in = NULL; char *sizes_ptr; + const unsigned long max_nr_page = *out_pages; int ret = 0; /* Points to the file offset of input data */ u64 cur_in = start; @@ -210,6 +219,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, u32 cur_out = 0; u32 len = *total_out; + ASSERT(max_nr_page > 0); *out_pages = 0; *total_out = 0; *total_in = 0; @@ -248,7 +258,8 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, } ret = copy_compressed_data_to_page(workspace->cbuf, out_len, - pages, &cur_out, sectorsize); + pages, max_nr_page, + &cur_out, sectorsize); if (ret < 0) goto out; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index cf82ea6f54fb..8f6ceea33969 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -73,8 +73,8 @@ struct scrub_page { u64 physical_for_dev_replace; atomic_t refs; u8 mirror_num; - int have_csum:1; - int io_error:1; + unsigned int have_csum:1; + unsigned int io_error:1; u8 csum[BTRFS_CSUM_SIZE]; struct scrub_recover *recover; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 61ac57bcbf1a..0997e3cd74e9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7559,6 +7559,19 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) fs_info->fs_devices->total_rw_bytes = 0; /* + * Lockdep complains about possible circular locking dependency between + * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores + * used for freeze procection of a fs (struct super_block.s_writers), + * which we take when starting a transaction, and extent buffers of the + * chunk tree if we call read_one_dev() while holding a lock on an + * extent buffer of the chunk tree. Since we are mounting the filesystem + * and at this point there can't be any concurrent task modifying the + * chunk tree, to keep it simple, just skip locking on the chunk tree. + */ + ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); + path->skip_locking = 1; + + /* * Read all device items, and then all the chunk items. All * device items are found before any chunk item (their object id * is smaller than the lowest possible object id for a chunk @@ -7583,10 +7596,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) goto error; break; } - /* - * The nodes on level 1 are not locked but we don't need to do - * that during mount time as nothing else can access the tree - */ node = path->nodes[1]; if (node) { if (last_ra_node != node->start) { @@ -7614,7 +7623,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) * requirement for chunk allocation, see the comment on * top of btrfs_chunk_alloc() for details. */ - ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); ret = read_one_chunk(&found_key, leaf, chunk); if (ret) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index f06b68040352..fc42dd0badd7 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -28,10 +28,10 @@ /* 307s to avoid pathologically clashing with transaction commit */ #define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ) -static ZSTD_parameters zstd_get_btrfs_parameters(unsigned int level, +static zstd_parameters zstd_get_btrfs_parameters(unsigned int level, size_t src_len) { - ZSTD_parameters params = ZSTD_getParams(level, src_len, 0); + zstd_parameters params = zstd_get_params(level, src_len); if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG) params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG; @@ -48,8 +48,8 @@ struct workspace { unsigned long last_used; /* jiffies */ struct list_head list; struct list_head lru_list; - ZSTD_inBuffer in_buf; - ZSTD_outBuffer out_buf; + zstd_in_buffer in_buf; + zstd_out_buffer out_buf; }; /* @@ -155,12 +155,12 @@ static void zstd_calc_ws_mem_sizes(void) unsigned int level; for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) { - ZSTD_parameters params = + zstd_parameters params = zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT); size_t level_size = max_t(size_t, - ZSTD_CStreamWorkspaceBound(params.cParams), - ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT)); + zstd_cstream_workspace_bound(¶ms.cParams), + zstd_dstream_workspace_bound(ZSTD_BTRFS_MAX_INPUT)); max_size = max_t(size_t, max_size, level_size); zstd_ws_mem_sizes[level - 1] = max_size; @@ -371,7 +371,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); - ZSTD_CStream *stream; + zstd_cstream *stream; int ret = 0; int nr_pages = 0; struct page *in_page = NULL; /* The current page to read */ @@ -381,7 +381,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long len = *total_out; const unsigned long nr_dest_pages = *out_pages; unsigned long max_out = nr_dest_pages * PAGE_SIZE; - ZSTD_parameters params = zstd_get_btrfs_parameters(workspace->req_level, + zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level, len); *out_pages = 0; @@ -389,10 +389,10 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, *total_in = 0; /* Initialize the stream */ - stream = ZSTD_initCStream(params, len, workspace->mem, + stream = zstd_init_cstream(¶ms, len, workspace->mem, workspace->size); if (!stream) { - pr_warn("BTRFS: ZSTD_initCStream failed\n"); + pr_warn("BTRFS: zstd_init_cstream failed\n"); ret = -EIO; goto out; } @@ -418,11 +418,11 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, while (1) { size_t ret2; - ret2 = ZSTD_compressStream(stream, &workspace->out_buf, + ret2 = zstd_compress_stream(stream, &workspace->out_buf, &workspace->in_buf); - if (ZSTD_isError(ret2)) { - pr_debug("BTRFS: ZSTD_compressStream returned %d\n", - ZSTD_getErrorCode(ret2)); + if (zstd_is_error(ret2)) { + pr_debug("BTRFS: zstd_compress_stream returned %d\n", + zstd_get_error_code(ret2)); ret = -EIO; goto out; } @@ -487,10 +487,10 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, while (1) { size_t ret2; - ret2 = ZSTD_endStream(stream, &workspace->out_buf); - if (ZSTD_isError(ret2)) { - pr_debug("BTRFS: ZSTD_endStream returned %d\n", - ZSTD_getErrorCode(ret2)); + ret2 = zstd_end_stream(stream, &workspace->out_buf); + if (zstd_is_error(ret2)) { + pr_debug("BTRFS: zstd_end_stream returned %d\n", + zstd_get_error_code(ret2)); ret = -EIO; goto out; } @@ -548,17 +548,17 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) struct workspace *workspace = list_entry(ws, struct workspace, list); struct page **pages_in = cb->compressed_pages; size_t srclen = cb->compressed_len; - ZSTD_DStream *stream; + zstd_dstream *stream; int ret = 0; unsigned long page_in_index = 0; unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; unsigned long total_out = 0; - stream = ZSTD_initDStream( + stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (!stream) { - pr_debug("BTRFS: ZSTD_initDStream failed\n"); + pr_debug("BTRFS: zstd_init_dstream failed\n"); ret = -EIO; goto done; } @@ -574,11 +574,11 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) while (1) { size_t ret2; - ret2 = ZSTD_decompressStream(stream, &workspace->out_buf, + ret2 = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); - if (ZSTD_isError(ret2)) { - pr_debug("BTRFS: ZSTD_decompressStream returned %d\n", - ZSTD_getErrorCode(ret2)); + if (zstd_is_error(ret2)) { + pr_debug("BTRFS: zstd_decompress_stream returned %d\n", + zstd_get_error_code(ret2)); ret = -EIO; goto done; } @@ -624,16 +624,16 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); - ZSTD_DStream *stream; + zstd_dstream *stream; int ret = 0; size_t ret2; unsigned long total_out = 0; unsigned long pg_offset = 0; - stream = ZSTD_initDStream( + stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (!stream) { - pr_warn("BTRFS: ZSTD_initDStream failed\n"); + pr_warn("BTRFS: zstd_init_dstream failed\n"); ret = -EIO; goto finish; } @@ -657,15 +657,15 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in, /* Check if the frame is over and we still need more input */ if (ret2 == 0) { - pr_debug("BTRFS: ZSTD_decompressStream ended early\n"); + pr_debug("BTRFS: zstd_decompress_stream ended early\n"); ret = -EIO; goto finish; } - ret2 = ZSTD_decompressStream(stream, &workspace->out_buf, + ret2 = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); - if (ZSTD_isError(ret2)) { - pr_debug("BTRFS: ZSTD_decompressStream returned %d\n", - ZSTD_getErrorCode(ret2)); + if (zstd_is_error(ret2)) { + pr_debug("BTRFS: zstd_decompress_stream returned %d\n", + zstd_get_error_code(ret2)); ret = -EIO; goto finish; } |