Diffstat (limited to 'fs/btrfs')
47 files changed, 3772 insertions, 1515 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index c9ee579bc5a6..ebc392ea1d74 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -789,11 +789,13 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, if (IS_ERR(eb)) { free_pref(ref); return PTR_ERR(eb); - } else if (!extent_buffer_uptodate(eb)) { + } + if (!extent_buffer_uptodate(eb)) { free_pref(ref); free_extent_buffer(eb); return -EIO; } + if (lock) btrfs_tree_read_lock(eb); if (btrfs_header_level(eb) == 0) @@ -1335,7 +1337,8 @@ again: if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; - } else if (!extent_buffer_uptodate(eb)) { + } + if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); ret = -EIO; goto out; diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 1db24e6d6d90..c22d287e020b 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -124,7 +124,16 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) { if (refcount_dec_and_test(&cache->refs)) { WARN_ON(cache->pinned > 0); - WARN_ON(cache->reserved > 0); + /* + * If there was a failure to cleanup a log tree, very likely due + * to an IO failure on a writeback attempt of one or more of its + * extent buffers, we could not do proper (and cheap) unaccounting + * of their reserved space, so don't warn on reserved > 0 in that + * case. + */ + if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) || + !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info)) + WARN_ON(cache->reserved > 0); /* * A block_group shouldn't be on the discard_list anymore. @@ -1513,8 +1522,12 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) + sb_start_write(fs_info->sb); + + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { + sb_end_write(fs_info->sb); return; + } /* * Long running balances can keep us blocked here for eternity, so @@ -1522,6 +1535,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) */ if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) { btrfs_exclop_finish(fs_info); + sb_end_write(fs_info->sb); return; } @@ -1596,6 +1610,7 @@ next: spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); + sb_end_write(fs_info->sb); } void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) @@ -1997,6 +2012,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, cache->length = key->offset; cache->used = btrfs_stack_block_group_used(bgi); cache->flags = btrfs_stack_block_group_flags(bgi); + cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); set_free_space_tree_thresholds(cache); @@ -2279,7 +2295,7 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, spin_lock(&block_group->lock); btrfs_set_stack_block_group_used(&bgi, block_group->used); btrfs_set_stack_block_group_chunk_objectid(&bgi, - BTRFS_FIRST_CHUNK_TREE_OBJECTID); + block_group->global_root_id); btrfs_set_stack_block_group_flags(&bgi, block_group->flags); key.objectid = block_group->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; @@ -2435,6 +2451,27 @@ next: btrfs_trans_release_chunk_metadata(trans); } +/* + * For extent tree v2 we use the block_group_item->chunk_offset to point at our + * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID. 
+ */ +static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) +{ + u64 div = SZ_1G; + u64 index; + + if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return BTRFS_FIRST_CHUNK_TREE_OBJECTID; + + /* If we have a smaller fs index based on 128MiB. */ + if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL)) + div = SZ_128M; + + offset = div64_u64(offset, div); + div64_u64_rem(offset, fs_info->nr_global_roots, &index); + return index; +} + struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, u64 type, u64 chunk_offset, u64 size) @@ -2455,6 +2492,8 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran cache->flags = type; cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; + cache->global_root_id = calculate_global_root_id(fs_info, cache->start); + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) cache->needs_free_space = 1; @@ -2544,6 +2583,19 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, int ret; bool dirty_bg_running; + /* + * This can only happen when we are doing read-only scrub on read-only + * mount. + * In that case we should not start a new transaction on read-only fs. + * Thus here we skip all chunk allocations. + */ + if (sb_rdonly(fs_info->sb)) { + mutex_lock(&fs_info->ro_block_group_mutex); + ret = inc_block_group_ro(cache, 0); + mutex_unlock(&fs_info->ro_block_group_mutex); + return ret; + } + do { trans = btrfs_join_transaction(root); if (IS_ERR(trans)) @@ -2671,7 +2723,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, bi = btrfs_item_ptr_offset(leaf, path->slots[0]); btrfs_set_stack_block_group_used(&bgi, cache->used); btrfs_set_stack_block_group_chunk_objectid(&bgi, - BTRFS_FIRST_CHUNK_TREE_OBJECTID); + cache->global_root_id); btrfs_set_stack_block_group_flags(&bgi, cache->flags); write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); btrfs_mark_buffer_dirty(leaf); @@ -3974,9 +4026,22 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) * important and indicates a real bug if this happens. */ if (WARN_ON(space_info->bytes_pinned > 0 || - space_info->bytes_reserved > 0 || space_info->bytes_may_use > 0)) btrfs_dump_space_info(info, space_info, 0, 0); + + /* + * If there was a failure to cleanup a log tree, very likely due + * to an IO failure on a writeback attempt of one or more of its + * extent buffers, we could not do proper (and cheap) unaccounting + * of their reserved space, so don't warn on bytes_reserved > 0 in + * that case. + */ + if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || + !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { + if (WARN_ON(space_info->bytes_reserved > 0)) + btrfs_dump_space_info(info, space_info, 0, 0); + } + WARN_ON(space_info->reclaim_size > 0); list_del(&space_info->list); btrfs_sysfs_remove_space_info(space_info); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 5878b7ce3b78..93aabc68bb6a 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -68,6 +68,7 @@ struct btrfs_block_group { u64 bytes_super; u64 flags; u64 cache_generation; + u64 global_root_id; /* * If the free space extent count exceeds this number, convert the block diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b3e46aabc3d8..47e72d72f7d0 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -14,6 +14,13 @@ #include "delayed-inode.h" /* + * Since we search a directory based on f_pos (struct dir_context::pos) we have + * to start at 2 since '.' 
and '..' have f_pos of 0 and 1 respectively, so + * everybody else has to start at 2 (see btrfs_real_readdir() and dir_emit_dots()). + */ +#define BTRFS_DIR_START_INDEX 2 + +/* * ordered_data_close is set by truncate when a file that used * to have good data has been truncated to zero. When it is set * the btrfs file release call will add this inode to the @@ -173,8 +180,9 @@ struct btrfs_inode { u64 disk_i_size; /* - * if this is a directory then index_cnt is the counter for the index - * number for new files that are created + * If this is a directory then index_cnt is the counter for the index + * number for new files that are created. For an empty directory, this + * must be initialized to BTRFS_DIR_START_INDEX. */ u64 index_cnt; @@ -333,6 +341,36 @@ static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode) spin_unlock(&inode->lock); } +/* + * Should be called while holding the inode's VFS lock in exclusive mode or in a + * context where no one else can access the inode concurrently (during inode + * creation or when loading an inode from disk). + */ +static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode) +{ + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); + /* + * The inode may have been part of a reflink operation in the last + * transaction that modified it, and then a fsync has reset the + * last_reflink_trans to avoid subsequent fsyncs in the same + * transaction to do unnecessary work. So update last_reflink_trans + * to the last_trans value (we have to be pessimistic and assume a + * reflink happened). + * + * The ->last_trans is protected by the inode's spinlock and we can + * have a concurrent ordered extent completion update it. Also set + * last_reflink_trans to ->last_trans only if the former is less than + * the later, because we can be called in a context where + * last_reflink_trans was set to the current transaction generation + * while ->last_trans was not yet updated in the current transaction, + * and therefore has a lower value. 
+ */ + spin_lock(&inode->lock); + if (inode->last_reflink_trans < inode->last_trans) + inode->last_reflink_trans = inode->last_trans; + spin_unlock(&inode->lock); +} + static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) { bool ret = false; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 7e9f90fa0388..abac86a75840 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -78,7 +78,6 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/mutex.h> -#include <linux/genhd.h> #include <linux/blkdev.h> #include <linux/mm.h> #include <linux/string.h> diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 71e5b2e9a1ba..be476f094300 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -219,7 +219,7 @@ static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *b bi_size += bvec->bv_len; if (bio->bi_status) - cb->errors = 1; + cb->status = bio->bi_status; ASSERT(bi_size && bi_size <= cb->compressed_len); last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits, @@ -234,7 +234,7 @@ static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *b return last_io; } -static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio) +static void finish_compressed_bio_read(struct compressed_bio *cb) { unsigned int index; struct page *page; @@ -247,19 +247,18 @@ static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bi } /* Do io completion on the original bio */ - if (cb->errors) { - bio_io_error(cb->orig_bio); + if (cb->status != BLK_STS_OK) { + cb->orig_bio->bi_status = cb->status; + bio_endio(cb->orig_bio); } else { struct bio_vec *bvec; struct bvec_iter_all iter_all; - ASSERT(bio); - ASSERT(!bio->bi_status); /* * We have verified the checksum already, set page checked so * the end_io handlers know about it */ - ASSERT(!bio_flagged(bio, BIO_CLONED)); + ASSERT(!bio_flagged(cb->orig_bio, BIO_CLONED)); bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) { u64 bvec_start = page_offset(bvec->bv_page) + bvec->bv_offset; @@ -308,7 +307,7 @@ static void end_compressed_bio_read(struct bio *bio) * Some IO in this cb have failed, just skip checksum as there * is no way it could be correct. 
*/ - if (cb->errors == 1) + if (cb->status != BLK_STS_OK) goto csum_failed; inode = cb->inode; @@ -324,8 +323,8 @@ static void end_compressed_bio_read(struct bio *bio) csum_failed: if (ret) - cb->errors = 1; - finish_compressed_bio_read(cb, bio); + cb->status = errno_to_blk_status(ret); + finish_compressed_bio_read(cb); out: bio_put(bio); } @@ -342,11 +341,12 @@ static noinline void end_compressed_writeback(struct inode *inode, unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct page *pages[16]; unsigned long nr_pages = end_index - index + 1; + const int errno = blk_status_to_errno(cb->status); int i; int ret; - if (cb->errors) - mapping_set_error(inode->i_mapping, -EIO); + if (errno) + mapping_set_error(inode->i_mapping, errno); while (nr_pages > 0) { ret = find_get_pages_contig(inode->i_mapping, index, @@ -358,7 +358,7 @@ static noinline void end_compressed_writeback(struct inode *inode, continue; } for (i = 0; i < ret; i++) { - if (cb->errors) + if (errno) SetPageError(pages[i]); btrfs_page_clamp_clear_writeback(fs_info, pages[i], cb->start, cb->len); @@ -381,9 +381,10 @@ static void finish_compressed_bio_write(struct compressed_bio *cb) */ btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL, cb->start, cb->start + cb->len - 1, - !cb->errors); + cb->status == BLK_STS_OK); - end_compressed_writeback(inode, cb); + if (cb->writeback) + end_compressed_writeback(inode, cb); /* Note, our inode could be gone now */ /* @@ -506,7 +507,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, struct page **compressed_pages, unsigned int nr_pages, unsigned int write_flags, - struct cgroup_subsys_state *blkcg_css) + struct cgroup_subsys_state *blkcg_css, + bool writeback) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio = NULL; @@ -524,13 +526,14 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, if (!cb) return BLK_STS_RESOURCE; refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); - cb->errors = 0; + cb->status = BLK_STS_OK; cb->inode = &inode->vfs_inode; cb->start = start; cb->len = len; cb->mirror_num = 0; cb->compressed_pages = compressed_pages; cb->compressed_len = compressed_len; + cb->writeback = writeback; cb->orig_bio = NULL; cb->nr_pages = nr_pages; @@ -591,7 +594,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, if (submit) { if (!skip_sum) { - ret = btrfs_csum_one_bio(inode, bio, start, 1); + ret = btrfs_csum_one_bio(inode, bio, start, true); if (ret) goto finish_cb; } @@ -808,7 +811,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_len; u64 em_start; struct extent_map *em; - blk_status_t ret = BLK_STS_RESOURCE; + blk_status_t ret; int faili = 0; u8 *sums; @@ -821,17 +824,21 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); read_unlock(&em_tree->lock); - if (!em) - return BLK_STS_IOERR; + if (!em) { + ret = BLK_STS_IOERR; + goto out; + } ASSERT(em->compress_type != BTRFS_COMPRESS_NONE); compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); - if (!cb) + if (!cb) { + ret = BLK_STS_RESOURCE; goto out; + } refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); - cb->errors = 0; + cb->status = BLK_STS_OK; cb->inode = inode; cb->mirror_num = mirror_num; sums = cb->sums; @@ -851,8 
+858,10 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!cb->compressed_pages) + if (!cb->compressed_pages) { + ret = BLK_STS_RESOURCE; goto fail1; + } for (pg_index = 0; pg_index < nr_pages; pg_index++) { cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS); @@ -938,7 +947,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, comp_bio = NULL; } } - return 0; + return BLK_STS_OK; fail2: while (faili >= 0) { @@ -951,6 +960,8 @@ fail1: kfree(cb); out: free_extent_map(em); + bio->bi_status = ret; + bio_endio(bio); return ret; finish_cb: if (comp_bio) { @@ -970,7 +981,7 @@ finish_cb: */ ASSERT(refcount_read(&cb->pending_sectors)); /* Now we are the only one referring @cb, can finish it safely. */ - finish_compressed_bio_read(cb, NULL); + finish_compressed_bio_read(cb); return ret; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 56eef0821e3e..ac5b20731d2a 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -22,6 +22,8 @@ struct btrfs_inode; /* Maximum length of compressed data stored on disk */ #define BTRFS_MAX_COMPRESSED (SZ_128K) +static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); + /* Maximum size of data before compression */ #define BTRFS_MAX_UNCOMPRESSED (SZ_128K) @@ -52,8 +54,11 @@ struct compressed_bio { /* The compression algorithm for this bio */ u8 compress_type; + /* Whether this is a write for writeback. */ + bool writeback; + /* IO errors */ - u8 errors; + blk_status_t status; int mirror_num; /* for reads, this is the bio we are copying the data into */ @@ -95,7 +100,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, struct page **compressed_pages, unsigned int nr_pages, unsigned int write_flags, - struct cgroup_subsys_state *blkcg_css); + struct cgroup_subsys_state *blkcg_css, + bool writeback); blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a7db3f6f1b7b..0eecf98d0abb 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -846,9 +846,11 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, btrfs_header_owner(parent), btrfs_node_ptr_generation(parent, slot), level - 1, &first_key); - if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) + return eb; + if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); - eb = ERR_PTR(-EIO); + return ERR_PTR(-EIO); } return eb; @@ -1436,13 +1438,13 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, /* now we're allowed to do a blocking uptodate check */ ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key); - if (!ret) { - *eb_ret = tmp; - return 0; + if (ret) { + free_extent_buffer(tmp); + btrfs_release_path(p); + return -EIO; } - free_extent_buffer(tmp); - btrfs_release_path(p); - return -EIO; + *eb_ret = tmp; + return 0; } /* @@ -1460,19 +1462,19 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, ret = -EAGAIN; tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid, gen, parent_level - 1, &first_key); - if (!IS_ERR(tmp)) { - /* - * If the read above didn't mark this buffer up to date, - * it will never end up being up to date. Set ret to EIO now - * and give up so that our caller doesn't loop forever - * on our EAGAINs. 
- */ - if (!extent_buffer_uptodate(tmp)) - ret = -EIO; - free_extent_buffer(tmp); - } else { - ret = PTR_ERR(tmp); + if (IS_ERR(tmp)) { + btrfs_release_path(p); + return PTR_ERR(tmp); } + /* + * If the read above didn't mark this buffer up to date, + * it will never end up being up to date. Set ret to EIO now + * and give up so that our caller doesn't loop forever + * on our EAGAINs. + */ + if (!extent_buffer_uptodate(tmp)) + ret = -EIO; + free_extent_buffer(tmp); btrfs_release_path(p); return ret; @@ -2990,16 +2992,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (free_space < data_size) goto out_unlock; - /* cow and double check */ ret = btrfs_cow_block(trans, root, right, upper, slot + 1, &right, BTRFS_NESTING_RIGHT_COW); if (ret) goto out_unlock; - free_space = btrfs_leaf_free_space(right); - if (free_space < data_size) - goto out_unlock; - left_nritems = btrfs_header_nritems(left); if (left_nritems == 0) goto out_unlock; @@ -3224,7 +3221,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - /* cow and double check */ ret = btrfs_cow_block(trans, root, left, path->nodes[1], slot - 1, &left, BTRFS_NESTING_LEFT_COW); @@ -3235,12 +3231,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - free_space = btrfs_leaf_free_space(left); - if (free_space < data_size) { - ret = 1; - goto out; - } - if (check_sibling_keys(left, right)) { ret = -EUCLEAN; goto out; @@ -4170,24 +4160,22 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; - u32 last_off; - u32 dsize = 0; int ret = 0; int wret; - int i; u32 nritems; leaf = path->nodes[0]; - last_off = btrfs_item_offset(leaf, slot + nr - 1); - - for (i = 0; i < nr; i++) - dsize += btrfs_item_size(leaf, slot + i); - nritems = btrfs_header_nritems(leaf); if (slot + nr != nritems) { - int data_end = leaf_data_end(leaf); + const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1); + const int data_end = leaf_data_end(leaf); struct btrfs_map_token token; + u32 dsize = 0; + int i; + + for (i = 0; i < nr; i++) + dsize += btrfs_item_size(leaf, slot + i); memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + data_end + dsize, @@ -4227,24 +4215,50 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, fixup_low_keys(path, &disk_key, 1); } - /* delete the leaf if it is mostly empty */ + /* + * Try to delete the leaf if it is mostly empty. We do this by + * trying to move all its items into its left and right neighbours. + * If we can't move all the items, then we don't delete it - it's + * not ideal, but future insertions might fill the leaf with more + * items, or items from other leaves might be moved later into our + * leaf due to deletions on those leaves. + */ if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) { + u32 min_push_space; + /* push_leaf_left fixes the path. * make sure the path still points to our leaf * for possible call to del_ptr below */ slot = path->slots[1]; atomic_inc(&leaf->refs); - - wret = push_leaf_left(trans, root, path, 1, 1, - 1, (u32)-1); + /* + * We want to be able to at least push one item to the + * left neighbour leaf, and that's the first item. 
+ */ + min_push_space = sizeof(struct btrfs_item) + + btrfs_item_size(leaf, 0); + wret = push_leaf_left(trans, root, path, 0, + min_push_space, 1, (u32)-1); if (wret < 0 && wret != -ENOSPC) ret = wret; if (path->nodes[0] == leaf && btrfs_header_nritems(leaf)) { - wret = push_leaf_right(trans, root, path, 1, - 1, 1, 0); + /* + * If we were not able to push all items from our + * leaf to its left neighbour, then attempt to + * either push all the remaining items to the + * right neighbour or none. There's no advantage + * in pushing only some items, instead of all, as + * it's pointless to end up with a leaf having + * too few items while the neighbours can be full + * or nearly full. + */ + nritems = btrfs_header_nritems(leaf); + min_push_space = leaf_space_used(leaf, 0, nritems); + wret = push_leaf_right(trans, root, path, 0, + min_push_space, 1, 0); if (wret < 0 && wret != -ENOSPC) ret = wret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b4a9b1c58d22..b7631b88426e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -49,6 +49,7 @@ extern struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_ordered_sum; struct btrfs_ref; struct btrfs_bio; +struct btrfs_ioctl_encoded_io_args; #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ @@ -145,6 +146,11 @@ enum { BTRFS_FS_STATE_DUMMY_FS_INFO, BTRFS_FS_STATE_NO_CSUMS, + + /* Indicates there was an error cleaning up a log tree. */ + BTRFS_FS_STATE_LOG_CLEANUP_ERROR, + + BTRFS_FS_STATE_COUNT }; #define BTRFS_BACKREF_REV_MAX 256 @@ -271,8 +277,14 @@ struct btrfs_super_block { /* the UUID written into btree blocks */ u8 metadata_uuid[BTRFS_FSID_SIZE]; + /* Extent tree v2 */ + __le64 block_group_root; + __le64 block_group_root_generation; + u8 block_group_root_level; + /* future expansion */ - __le64 reserved[28]; + u8 reserved8[7]; + __le64 reserved[25]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; @@ -297,6 +309,26 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL +#ifdef CONFIG_BTRFS_DEBUG +/* + * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG + */ +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ + BTRFS_FEATURE_INCOMPAT_ZONED | \ + BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) +#else #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ @@ -311,6 +343,7 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ BTRFS_FEATURE_INCOMPAT_ZONED) +#endif #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) @@ -599,6 +632,9 @@ enum { /* Indicate that we want the transaction kthread to commit right now. */ BTRFS_FS_COMMIT_TRANS, + /* Indicate we have half completed snapshot deletions pending. 
*/ + BTRFS_FS_UNFINISHED_DROPS, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -630,6 +666,7 @@ struct btrfs_fs_info { struct btrfs_root *quota_root; struct btrfs_root *uuid_root; struct btrfs_root *data_reloc_root; + struct btrfs_root *block_group_root; /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; @@ -1024,6 +1061,8 @@ struct btrfs_fs_info { spinlock_t relocation_bg_lock; u64 data_reloc_bg; + u64 nr_global_roots; + spinlock_t zone_active_bgs_lock; struct list_head zone_active_bgs; @@ -1103,8 +1142,15 @@ enum { BTRFS_ROOT_QGROUP_FLUSHING, /* We started the orphan cleanup for this root. */ BTRFS_ROOT_ORPHAN_CLEANUP, + /* This root has a drop operation that was started previously. */ + BTRFS_ROOT_UNFINISHED_DROP, }; +static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) +{ + clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); +} + /* * Record swapped tree blocks of a subvolume tree for delayed subtree trace * code. For detail check comment in fs/btrfs/qgroup.c. @@ -1596,25 +1642,25 @@ DECLARE_BTRFS_SETGET_BITS(64) static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ const type *s) \ { \ - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ return btrfs_get_##bits(eb, s, offsetof(type, member)); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ u##bits val) \ { \ - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ btrfs_set_##bits(eb, s, offsetof(type, member), val); \ } \ static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ const type *s) \ { \ - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ return btrfs_get_token_##bits(token, s, offsetof(type, member));\ } \ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ type *s, u##bits val) \ { \ - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ } @@ -1645,8 +1691,8 @@ static inline void btrfs_set_##name(type *s, u##bits val) \ static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s) { - BUILD_BUG_ON(sizeof(u64) != - sizeof(((struct btrfs_dev_item *)0))->total_bytes); + static_assert(sizeof(u64) == + sizeof(((struct btrfs_dev_item *)0))->total_bytes); return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes)); } @@ -1654,8 +1700,8 @@ static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s, u64 val) { - BUILD_BUG_ON(sizeof(u64) != - sizeof(((struct btrfs_dev_item *)0))->total_bytes); + static_assert(sizeof(u64) == + sizeof(((struct btrfs_dev_item *)0))->total_bytes); WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); } @@ -2315,6 +2361,17 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, num_devices, 64); +/* + * For extent tree v2 we overload the extent root with the block group root, as + * we will have multiple extent roots. 
+ */ +BTRFS_SETGET_STACK_FUNCS(backup_block_group_root, struct btrfs_root_backup, + extent_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_gen, struct btrfs_root_backup, + extent_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_level, + struct btrfs_root_backup, extent_root_level, 8); + /* struct btrfs_balance_item */ BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); @@ -2449,6 +2506,13 @@ BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_block_group_root, struct btrfs_super_block, + block_group_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_block_group_root_generation, + struct btrfs_super_block, + block_group_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_block_group_root_level, struct btrfs_super_block, + block_group_root_level, 8); int btrfs_super_csum_size(const struct btrfs_super_block *s); const char *btrfs_super_csum_name(u16 csum_type); @@ -2826,7 +2890,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv); void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); -int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, + u64 disk_num_bytes); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); @@ -3142,7 +3207,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 file_start, int contig); + u64 offset, bool one_ordered); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, @@ -3243,6 +3308,11 @@ int btrfs_writepage_cow_fixup(struct page *page); void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, u64 end, bool uptodate); +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, + struct btrfs_ioctl_encoded_io_args *encoded); +ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded); + extern const struct dentry_operations btrfs_dentry_operations; extern const struct iomap_ops btrfs_dio_iomap_ops; extern const struct iomap_dio_ops btrfs_dio_ops; @@ -3288,7 +3358,7 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, int __init btrfs_auto_defrag_init(void); void __cold btrfs_auto_defrag_exit(void); int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode); + struct btrfs_inode *inode, u32 extent_thresh); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); @@ -3305,6 +3375,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, struct btrfs_trans_handle **trans_out); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end); +ssize_t btrfs_do_write_iter(struct kiocb *iocb, 
struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded); int btrfs_release_file(struct inode *inode, struct file *file); int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, @@ -3593,6 +3665,9 @@ do { \ #define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ &(fs_info)->fs_state))) +#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ + (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ + &(fs_info)->fs_state))) __printf(5, 6) __cold @@ -3758,7 +3833,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_recover_relocation(struct btrfs_root *root); +int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len); int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -3870,5 +3945,8 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) #define PageOrdered(page) PagePrivate2(page) #define SetPageOrdered(page) SetPagePrivate2(page) #define ClearPageOrdered(page) ClearPagePrivate2(page) +#define folio_test_ordered(folio) folio_test_private_2(folio) +#define folio_set_ordered(folio) folio_set_private_2(folio) +#define folio_clear_ordered(folio) folio_clear_private_2(folio) #endif diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index fb46a28f5065..bd8267c4687d 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -270,11 +270,11 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, } static void calc_inode_reservations(struct btrfs_fs_info *fs_info, - u64 num_bytes, u64 *meta_reserve, - u64 *qgroup_reserve) + u64 num_bytes, u64 disk_num_bytes, + u64 *meta_reserve, u64 *qgroup_reserve) { u64 nr_extents = count_max_extents(num_bytes); - u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes); + u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes); u64 inode_update = btrfs_calc_metadata_size(fs_info, 1); *meta_reserve = btrfs_calc_insert_metadata_size(fs_info, @@ -288,7 +288,8 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info, *qgroup_reserve = nr_extents * fs_info->nodesize; } -int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, + u64 disk_num_bytes) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -318,6 +319,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) } num_bytes = ALIGN(num_bytes, fs_info->sectorsize); + disk_num_bytes = ALIGN(disk_num_bytes, fs_info->sectorsize); /* * We always want to do it this way, every other way is wrong and ends @@ -329,8 +331,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) * everything out and try again, which is bad. This way we just * over-reserve slightly, and clean up the mess when we are done. 
*/ - calc_inode_reservations(fs_info, num_bytes, &meta_reserve, - &qgroup_reserve); + calc_inode_reservations(fs_info, num_bytes, disk_num_bytes, + &meta_reserve, &qgroup_reserve); ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); if (ret) return ret; @@ -349,7 +351,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) spin_lock(&inode->lock); nr_extents = count_max_extents(num_bytes); btrfs_mod_outstanding_extents(inode, nr_extents); - inode->csum_bytes += num_bytes; + inode->csum_bytes += disk_num_bytes; btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); @@ -454,7 +456,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, ret = btrfs_check_data_free_space(inode, reserved, start, len); if (ret < 0) return ret; - ret = btrfs_delalloc_reserve_metadata(inode, len); + ret = btrfs_delalloc_reserve_metadata(inode, len, len); if (ret < 0) { btrfs_free_reserved_data_space(inode, *reserved, start, len); extent_changeset_free(*reserved); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 62b9651ea662..71fd99b48283 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -243,6 +243,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev, struct btrfs_device **device_out) { + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; struct block_device *bdev; struct rcu_string *name; @@ -271,7 +272,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, sync_blockdev(bdev); - list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->bdev == bdev) { btrfs_err(fs_info, "target device is in the filesystem!"); @@ -302,6 +303,9 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, goto error; } rcu_assign_pointer(device->name, name); + ret = lookup_bdev(device_path, &device->devt); + if (ret) + goto error; set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); device->generation = 0; @@ -320,17 +324,17 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->mode = FMODE_EXCL; device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); - device->fs_devices = fs_info->fs_devices; + device->fs_devices = fs_devices; ret = btrfs_get_dev_zone_info(device, false); if (ret) goto error; - mutex_lock(&fs_info->fs_devices->device_list_mutex); - list_add(&device->dev_list, &fs_info->fs_devices->devices); - fs_info->fs_devices->num_devices++; - fs_info->fs_devices->open_devices++; - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_devices->device_list_mutex); + list_add(&device->dev_list, &fs_devices->devices); + fs_devices->num_devices++; + fs_devices->open_devices++; + mutex_unlock(&fs_devices->device_list_mutex); *device_out = device; return 0; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 87a5addbedf6..b30309f187cf 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -441,17 +441,31 @@ static int csum_one_extent_buffer(struct extent_buffer *eb) else ret = btrfs_check_leaf_full(eb); - if (ret < 0) { - btrfs_print_tree(eb, 0); + if (ret < 0) + goto error; + + /* + * Also check the generation, the eb reached here must be newer than + * last committed. Or something seriously wrong happened. 
+ */ + if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) { + ret = -EUCLEAN; btrfs_err(fs_info, - "block=%llu write time tree block corruption detected", - eb->start); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); - return ret; + "block=%llu bad generation, have %llu expect > %llu", + eb->start, btrfs_header_generation(eb), + fs_info->last_trans_committed); + goto error; } write_extent_buffer(eb, result, 0, fs_info->csum_size); return 0; + +error: + btrfs_print_tree(eb, 0); + btrfs_err(fs_info, "block=%llu write time tree block corruption detected", + eb->start); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + return ret; } /* Checksum all dirty extent buffers in one bio_vec */ @@ -999,41 +1013,40 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) return try_release_extent_buffer(page); } -static void btree_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +static void btree_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { struct extent_io_tree *tree; - tree = &BTRFS_I(page->mapping->host)->io_tree; - extent_invalidatepage(tree, page, offset); - btree_releasepage(page, GFP_NOFS); - if (PagePrivate(page)) { - btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info, - "page private not zero on page %llu", - (unsigned long long)page_offset(page)); - detach_page_private(page); + tree = &BTRFS_I(folio->mapping->host)->io_tree; + extent_invalidate_folio(tree, folio, offset); + btree_releasepage(&folio->page, GFP_NOFS); + if (folio_get_private(folio)) { + btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info, + "folio private not zero on folio %llu", + (unsigned long long)folio_pos(folio)); + folio_detach_private(folio); } } -static int btree_set_page_dirty(struct page *page) -{ #ifdef DEBUG - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); +static bool btree_dirty_folio(struct address_space *mapping, + struct folio *folio) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); struct btrfs_subpage *subpage; struct extent_buffer *eb; int cur_bit = 0; - u64 page_start = page_offset(page); + u64 page_start = folio_pos(folio); if (fs_info->sectorsize == PAGE_SIZE) { - BUG_ON(!PagePrivate(page)); - eb = (struct extent_buffer *)page->private; + eb = folio_get_private(folio); BUG_ON(!eb); BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); BUG_ON(!atomic_read(&eb->refs)); btrfs_assert_tree_write_locked(eb); - return __set_page_dirty_nobuffers(page); + return filemap_dirty_folio(mapping, folio); } - ASSERT(PagePrivate(page) && page->private); - subpage = (struct btrfs_subpage *)page->private; + subpage = folio_get_private(folio); ASSERT(subpage->dirty_bitmap); while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) { @@ -1059,18 +1072,20 @@ static int btree_set_page_dirty(struct page *page) cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits); } -#endif - return __set_page_dirty_nobuffers(page); + return filemap_dirty_folio(mapping, folio); } +#else +#define btree_dirty_folio filemap_dirty_folio +#endif static const struct address_space_operations btree_aops = { .writepages = btree_writepages, .releasepage = btree_releasepage, - .invalidatepage = btree_invalidatepage, + .invalidate_folio = btree_invalidate_folio, #ifdef CONFIG_MIGRATION .migratepage = btree_migratepage, #endif - .set_page_dirty = btree_set_page_dirty, + .dirty_folio = btree_dirty_folio, }; struct extent_buffer *btrfs_find_create_tree_block( @@ -1289,12 +1304,33 @@ struct btrfs_root *btrfs_global_root(struct btrfs_fs_info 
*fs_info, return root; } +static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group *block_group; + u64 ret; + + if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return 0; + + if (bytenr) + block_group = btrfs_lookup_block_group(fs_info, bytenr); + else + block_group = btrfs_lookup_first_block_group(fs_info, bytenr); + ASSERT(block_group); + if (!block_group) + return 0; + ret = block_group->global_root_id; + btrfs_put_block_group(block_group); + + return ret; +} + struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_key key = { .objectid = BTRFS_CSUM_TREE_OBJECTID, .type = BTRFS_ROOT_ITEM_KEY, - .offset = 0, + .offset = btrfs_global_root_id(fs_info, bytenr), }; return btrfs_global_root(fs_info, &key); @@ -1305,7 +1341,7 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr) struct btrfs_key key = { .objectid = BTRFS_EXTENT_TREE_OBJECTID, .type = BTRFS_ROOT_ITEM_KEY, - .offset = 0, + .offset = btrfs_global_root_id(fs_info, bytenr), }; return btrfs_global_root(fs_info, &key); @@ -1522,7 +1558,8 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, ret = PTR_ERR(root->node); root->node = NULL; goto fail; - } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { + } + if (!btrfs_buffer_uptodate(root->node, generation, 0)) { ret = -EIO; goto fail; } @@ -1727,6 +1764,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_put_root(fs_info->uuid_root); btrfs_put_root(fs_info->fs_root); btrfs_put_root(fs_info->data_reloc_root); + btrfs_put_root(fs_info->block_group_root); btrfs_check_leaked_roots(fs_info); btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); @@ -1925,8 +1963,7 @@ static void end_workqueue_fn(struct btrfs_work *work) static int cleaner_kthread(void *arg) { - struct btrfs_root *root = arg; - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)arg; int again; while (1) { @@ -1959,7 +1996,7 @@ static int cleaner_kthread(void *arg) btrfs_run_delayed_iputs(fs_info); - again = btrfs_clean_one_deleted_snapshot(root); + again = btrfs_clean_one_deleted_snapshot(fs_info); mutex_unlock(&fs_info->cleaner_mutex); /* @@ -2095,8 +2132,6 @@ static void backup_super_roots(struct btrfs_fs_info *info) { const int next_backup = info->backup_root_index; struct btrfs_root_backup *root_backup; - struct btrfs_root *extent_root = btrfs_extent_root(info, 0); - struct btrfs_root *csum_root = btrfs_csum_root(info, 0); root_backup = info->super_for_commit->super_roots + next_backup; @@ -2121,11 +2156,30 @@ static void backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_chunk_root_level(root_backup, btrfs_header_level(info->chunk_root->node)); - btrfs_set_backup_extent_root(root_backup, extent_root->node->start); - btrfs_set_backup_extent_root_gen(root_backup, - btrfs_header_generation(extent_root->node)); - btrfs_set_backup_extent_root_level(root_backup, - btrfs_header_level(extent_root->node)); + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) { + btrfs_set_backup_block_group_root(root_backup, + info->block_group_root->node->start); + btrfs_set_backup_block_group_root_gen(root_backup, + btrfs_header_generation(info->block_group_root->node)); + btrfs_set_backup_block_group_root_level(root_backup, + btrfs_header_level(info->block_group_root->node)); + } else { + struct btrfs_root *extent_root = btrfs_extent_root(info, 0); + struct btrfs_root *csum_root = 
btrfs_csum_root(info, 0); + + btrfs_set_backup_extent_root(root_backup, + extent_root->node->start); + btrfs_set_backup_extent_root_gen(root_backup, + btrfs_header_generation(extent_root->node)); + btrfs_set_backup_extent_root_level(root_backup, + btrfs_header_level(extent_root->node)); + + btrfs_set_backup_csum_root(root_backup, csum_root->node->start); + btrfs_set_backup_csum_root_gen(root_backup, + btrfs_header_generation(csum_root->node)); + btrfs_set_backup_csum_root_level(root_backup, + btrfs_header_level(csum_root->node)); + } /* * we might commit during log recovery, which happens before we set @@ -2146,12 +2200,6 @@ static void backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_dev_root_level(root_backup, btrfs_header_level(info->dev_root->node)); - btrfs_set_backup_csum_root(root_backup, csum_root->node->start); - btrfs_set_backup_csum_root_gen(root_backup, - btrfs_header_generation(csum_root->node)); - btrfs_set_backup_csum_root_level(root_backup, - btrfs_header_level(csum_root->node)); - btrfs_set_backup_total_bytes(root_backup, btrfs_super_total_bytes(info->super_copy)); btrfs_set_backup_bytes_used(root_backup, @@ -2269,6 +2317,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) free_root_extent_buffers(info->uuid_root); free_root_extent_buffers(info->fs_root); free_root_extent_buffers(info->data_reloc_root); + free_root_extent_buffers(info->block_group_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); } @@ -2504,11 +2553,13 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, log_tree_root->node = NULL; btrfs_put_root(log_tree_root); return ret; - } else if (!extent_buffer_uptodate(log_tree_root->node)) { + } + if (!extent_buffer_uptodate(log_tree_root->node)) { btrfs_err(fs_info, "failed to read log tree"); btrfs_put_root(log_tree_root); return -EIO; } + /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); if (ret) { @@ -2533,6 +2584,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, { struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_root *root; + u64 max_global_id = 0; int ret; struct btrfs_key key = { .objectid = objectid, @@ -2568,6 +2620,13 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, break; btrfs_release_path(path); + /* + * Just worry about this for extent tree, it'll be the same for + * everybody. 
+ */ + if (objectid == BTRFS_EXTENT_TREE_OBJECTID) + max_global_id = max(max_global_id, key.offset); + found = true; root = read_tree_root_path(tree_root, path, &key); if (IS_ERR(root)) { @@ -2585,6 +2644,9 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, } btrfs_release_path(path); + if (objectid == BTRFS_EXTENT_TREE_OBJECTID) + fs_info->nr_global_roots = max_global_id + 1; + if (!found || ret) { if (objectid == BTRFS_CSUM_TREE_OBJECTID) set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); @@ -2930,6 +2992,56 @@ out: return ret; } +static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level) +{ + int ret = 0; + + root->node = read_tree_block(root->fs_info, bytenr, + root->root_key.objectid, gen, level, NULL); + if (IS_ERR(root->node)) { + ret = PTR_ERR(root->node); + root->node = NULL; + return ret; + } + if (!extent_buffer_uptodate(root->node)) { + free_extent_buffer(root->node); + root->node = NULL; + return -EIO; + } + + btrfs_set_root_node(&root->root_item, root->node); + root->commit_root = btrfs_root_node(root); + btrfs_set_root_refs(&root->root_item, 1); + return ret; +} + +static int load_important_roots(struct btrfs_fs_info *fs_info) +{ + struct btrfs_super_block *sb = fs_info->super_copy; + u64 gen, bytenr; + int level, ret; + + bytenr = btrfs_super_root(sb); + gen = btrfs_super_generation(sb); + level = btrfs_super_root_level(sb); + ret = load_super_root(fs_info->tree_root, bytenr, gen, level); + if (ret) { + btrfs_warn(fs_info, "couldn't read tree root"); + return ret; + } + + if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return 0; + + bytenr = btrfs_super_block_group_root(sb); + gen = btrfs_super_block_group_root_generation(sb); + level = btrfs_super_block_group_root_level(sb); + ret = load_super_root(fs_info->block_group_root, bytenr, gen, level); + if (ret) + btrfs_warn(fs_info, "couldn't read block group root"); + return ret; +} + static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) { int backup_index = find_newest_super_backup(fs_info); @@ -2939,10 +3051,17 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) int ret = 0; int i; - for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { - u64 generation; - int level; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + struct btrfs_root *root; + + root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID, + GFP_KERNEL); + if (!root) + return -ENOMEM; + fs_info->block_group_root = root; + } + for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { if (handle_error) { if (!IS_ERR(tree_root->node)) free_extent_buffer(tree_root->node); @@ -2967,29 +3086,13 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) if (ret < 0) return ret; } - generation = btrfs_super_generation(sb); - level = btrfs_super_root_level(sb); - tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb), - BTRFS_ROOT_TREE_OBJECTID, - generation, level, NULL); - if (IS_ERR(tree_root->node)) { - handle_error = true; - ret = PTR_ERR(tree_root->node); - tree_root->node = NULL; - btrfs_warn(fs_info, "couldn't read tree root"); - continue; - } else if (!extent_buffer_uptodate(tree_root->node)) { + ret = load_important_roots(fs_info); + if (ret) { handle_error = true; - ret = -EIO; - btrfs_warn(fs_info, "error while reading tree root"); continue; } - btrfs_set_root_node(&tree_root->root_item, tree_root->node); - tree_root->commit_root = btrfs_root_node(tree_root); - btrfs_set_root_refs(&tree_root->root_item, 1); - /* * No need to hold btrfs_root::objectid_mutex since the fs * 
hasn't been fully initialised and we are the only user @@ -3009,8 +3112,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) } /* All successful */ - fs_info->generation = generation; - fs_info->last_trans_committed = generation; + fs_info->generation = btrfs_header_generation(tree_root->node); + fs_info->last_trans_committed = fs_info->generation; fs_info->last_reloc_trans = 0; /* Always begin writing backup roots after the one being used */ @@ -3293,7 +3396,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) up_read(&fs_info->cleanup_work_sem); mutex_lock(&fs_info->cleaner_mutex); - ret = btrfs_recover_relocation(fs_info->tree_root); + ret = btrfs_recover_relocation(fs_info); mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { btrfs_warn(fs_info, "failed to recover relocation: %d", ret); @@ -3594,21 +3697,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device generation = btrfs_super_chunk_root_generation(disk_super); level = btrfs_super_chunk_root_level(disk_super); - - chunk_root->node = read_tree_block(fs_info, - btrfs_super_chunk_root(disk_super), - BTRFS_CHUNK_TREE_OBJECTID, - generation, level, NULL); - if (IS_ERR(chunk_root->node) || - !extent_buffer_uptodate(chunk_root->node)) { + ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super), + generation, level); + if (ret) { btrfs_err(fs_info, "failed to read chunk root"); - if (!IS_ERR(chunk_root->node)) - free_extent_buffer(chunk_root->node); - chunk_root->node = NULL; goto fail_tree_roots; } - btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); - chunk_root->commit_root = btrfs_root_node(chunk_root); read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, offsetof(struct btrfs_header, chunk_tree_uuid), @@ -3728,7 +3822,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_sysfs; } - fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, + fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info, "btrfs-cleaner"); if (IS_ERR(fs_info->cleaner_kthread)) goto fail_sysfs; @@ -3813,6 +3907,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device set_bit(BTRFS_FS_OPEN, &fs_info->flags); + /* Kick the cleaner thread so it'll start deleting snapshots. */ + if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags)) + wake_up_process(fs_info->cleaner_kthread); + clear_oneshot: btrfs_clear_oneshot_options(fs_info); return 0; @@ -4029,8 +4127,9 @@ static int write_dev_supers(struct btrfs_device *device, * to do I/O, so we don't lose the ability to do integrity * checking. */ - bio = bio_alloc(GFP_NOFS, 1); - bio_set_dev(bio, device->bdev); + bio = bio_alloc(device->bdev, 1, + REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, + GFP_NOFS); bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT; bio->bi_private = device; bio->bi_end_io = btrfs_end_super_write; @@ -4042,7 +4141,6 @@ static int write_dev_supers(struct btrfs_device *device, * go down lazy and there's a short window where the on-disk * copies might still contain the older version. 
*/ - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO; if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) bio->bi_opf |= REQ_FUA; @@ -4154,10 +4252,8 @@ static void write_dev_flush(struct btrfs_device *device) return; #endif - bio_reset(bio); + bio_reset(bio, device->bdev, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH); bio->bi_end_io = btrfs_end_empty_barrier; - bio_set_dev(bio, device->bdev); - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; @@ -4538,6 +4634,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) */ kthread_park(fs_info->cleaner_kthread); + /* + * If we had UNFINISHED_DROPS we could still be processing them, so + * clear that bit and wake up relocation so it can stop. + */ + btrfs_wake_unfinished_drop(fs_info); + /* wait for the qgroup rescan worker to stop */ btrfs_qgroup_wait_for_completion(fs_info, false); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 5e8bef4b7563..2e10514ecda8 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -111,6 +111,8 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) { + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return fs_info->block_group_root; return btrfs_extent_root(fs_info, 0); } diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 04083ee5ae6e..c3eb52dbe61c 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -244,8 +244,8 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits); int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits); -int extent_invalidatepage(struct extent_io_tree *tree, - struct page *page, unsigned long offset); +int extent_invalidate_folio(struct extent_io_tree *tree, + struct folio *folio, size_t offset); bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d89273c4b6b8..f477035a2ac2 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -598,7 +598,7 @@ fail: static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - int refs_to_drop, int *last_ref) + int refs_to_drop) { struct btrfs_key key; struct btrfs_extent_data_ref *ref1 = NULL; @@ -631,7 +631,6 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, if (num_refs == 0) { ret = btrfs_del_item(trans, root, path); - *last_ref = 1; } else { if (key.type == BTRFS_EXTENT_DATA_REF_KEY) btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); @@ -1072,8 +1071,7 @@ static noinline_for_stack void update_inline_extent_backref(struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, - struct btrfs_delayed_extent_op *extent_op, - int *last_ref) + struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf = path->nodes[0]; struct btrfs_extent_item *ei; @@ -1121,7 +1119,6 @@ void update_inline_extent_backref(struct btrfs_path *path, else btrfs_set_shared_data_ref_count(leaf, sref, refs); } else { - *last_ref = 1; size = btrfs_extent_inline_ref_size(type); item_size = btrfs_item_size(leaf, path->slots[0]); ptr = (unsigned long)iref; @@ -1166,8 +1163,7 @@ int insert_inline_extent_backref(struct 
btrfs_trans_handle *trans, } return -EUCLEAN; } - update_inline_extent_backref(path, iref, refs_to_add, - extent_op, NULL); + update_inline_extent_backref(path, iref, refs_to_add, extent_op); } else if (ret == -ENOENT) { setup_inline_extent_backref(trans->fs_info, path, iref, parent, root_objectid, owner, offset, @@ -1181,21 +1177,17 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, - int refs_to_drop, int is_data, int *last_ref) + int refs_to_drop, int is_data) { int ret = 0; BUG_ON(!is_data && refs_to_drop != 1); - if (iref) { - update_inline_extent_backref(path, iref, -refs_to_drop, NULL, - last_ref); - } else if (is_data) { - ret = remove_extent_data_ref(trans, root, path, refs_to_drop, - last_ref); - } else { - *last_ref = 1; + if (iref) + update_inline_extent_backref(path, iref, -refs_to_drop, NULL); + else if (is_data) + ret = remove_extent_data_ref(trans, root, path, refs_to_drop); + else ret = btrfs_del_item(trans, root, path); - } return ret; } @@ -2766,12 +2758,11 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_unlock(&cache->lock); if (!readonly && return_free_space && global_rsv->space_info == space_info) { - u64 to_add = len; - spin_lock(&global_rsv->lock); if (!global_rsv->full) { - to_add = min(len, global_rsv->size - - global_rsv->reserved); + u64 to_add = min(len, global_rsv->size - + global_rsv->reserved); + global_rsv->reserved += to_add; btrfs_space_info_update_bytes_may_use(fs_info, space_info, to_add); @@ -2862,6 +2853,35 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) return 0; } +static int do_free_extent_accounting(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, bool is_data) +{ + int ret; + + if (is_data) { + struct btrfs_root *csum_root; + + csum_root = btrfs_csum_root(trans->fs_info, bytenr); + ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } + + ret = add_to_free_space_tree(trans, bytenr, num_bytes); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); + if (ret) + btrfs_abort_transaction(trans, ret); + + return ret; +} + /* * Drop one or more refs of @node. 
* @@ -2943,7 +2963,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, u64 refs; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; - int last_ref = 0; bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); extent_root = btrfs_extent_root(info, bytenr); @@ -3010,8 +3029,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, extent_root, path, - NULL, refs_to_drop, is_data, - &last_ref); + NULL, refs_to_drop, is_data); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3136,8 +3154,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, - iref, refs_to_drop, is_data, - &last_ref); + iref, refs_to_drop, is_data); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3182,7 +3199,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } - last_ref = 1; ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); if (ret) { @@ -3191,28 +3207,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - if (is_data) { - struct btrfs_root *csum_root; - csum_root = btrfs_csum_root(info, bytenr); - ret = btrfs_del_csums(trans, csum_root, bytenr, - num_bytes); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } - } - - ret = add_to_free_space_tree(trans, bytenr, num_bytes); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } - - ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } + ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data); } btrfs_release_path(path); @@ -4605,6 +4600,28 @@ int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, return ret; } +static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, + u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int ret; + + ret = remove_from_free_space_tree(trans, bytenr, num_bytes); + if (ret) + return ret; + + ret = btrfs_update_block_group(trans, bytenr, num_bytes, true); + if (ret) { + ASSERT(!ret); + btrfs_err(fs_info, "update block group failed for %llu %llu", + bytenr, num_bytes); + return ret; + } + + trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes); + return 0; +} + static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, @@ -4665,18 +4682,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset); - if (ret) - return ret; - - ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, true); - if (ret) { /* -ENOENT, logic error */ - btrfs_err(fs_info, "update block group failed for %llu %llu", - ins->objectid, ins->offset); - BUG(); - } - trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset); - return ret; + return alloc_reserved_extent(trans, ins->objectid, ins->offset); } static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, @@ -4694,7 +4700,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_delayed_tree_ref *ref; u32 size = sizeof(*extent_item) + sizeof(*iref); - u64 num_bytes; u64 flags = 
extent_op->flags_to_set; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); @@ -4704,12 +4709,10 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (skinny_metadata) { extent_key.offset = ref->level; extent_key.type = BTRFS_METADATA_ITEM_KEY; - num_bytes = fs_info->nodesize; } else { extent_key.offset = node->num_bytes; extent_key.type = BTRFS_EXTENT_ITEM_KEY; size += sizeof(*block_info); - num_bytes = node->num_bytes; } path = btrfs_alloc_path(); @@ -4754,22 +4757,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - ret = remove_from_free_space_tree(trans, extent_key.objectid, - num_bytes); - if (ret) - return ret; - - ret = btrfs_update_block_group(trans, extent_key.objectid, - fs_info->nodesize, true); - if (ret) { /* -ENOENT, logic error */ - btrfs_err(fs_info, "update block group failed for %llu %llu", - extent_key.objectid, extent_key.offset); - BUG(); - } - - trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid, - fs_info->nodesize); - return ret; + return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); } int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, @@ -5622,6 +5610,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) int ret; int level; bool root_dropped = false; + bool unfinished_drop = false; btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); @@ -5664,6 +5653,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) * already dropped. */ set_bit(BTRFS_ROOT_DELETING, &root->state); + unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state); + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); path->nodes[level] = btrfs_lock_root_node(root); @@ -5839,6 +5830,13 @@ out_free: btrfs_free_path(path); out: /* + * We were an unfinished drop root, check to see if there are any + * pending, and if not clear and wake up any waiters. + */ + if (!err && unfinished_drop) + btrfs_maybe_wake_unfinished_drop(fs_info); + + /* * So if we need to stop dropping the snapshot for whatever reason we * need to make sure to add it back to the dead root list so that we * keep trying to do the work later. This also cleans up roots if we diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 409bad3928db..d78b3a2d04e3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1507,17 +1507,17 @@ void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) { + struct address_space *mapping = inode->i_mapping; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; + struct folio *folio; while (index <= end_index) { - page = find_get_page(inode->i_mapping, index); - BUG_ON(!page); /* Pages should be in the extent_io_tree */ - __set_page_dirty_nobuffers(page); - account_page_redirty(page); - put_page(page); - index++; + folio = filemap_get_folio(mapping, index); + filemap_dirty_folio(mapping, folio); + folio_account_redirty(folio); + index += folio_nr_pages(folio); + folio_put(folio); } } @@ -2610,6 +2610,7 @@ static bool btrfs_check_repairable(struct inode *inode, * a good copy of the failed sector and if we succeed, we have setup * everything for repair_io_failure to do the rest for us. 
*/ + ASSERT(failed_mirror); failrec->failed_mirror = failed_mirror; failrec->this_mirror++; if (failrec->this_mirror == failed_mirror) @@ -2639,7 +2640,6 @@ int btrfs_repair_one_sector(struct inode *inode, const int icsum = bio_offset >> fs_info->sectorsize_bits; struct bio *repair_bio; struct btrfs_bio *repair_bbio; - blk_status_t status; btrfs_debug(fs_info, "repair read error: read error at %llu", start); @@ -2678,13 +2678,13 @@ int btrfs_repair_one_sector(struct inode *inode, "repair read error: submitting new read to mirror %d", failrec->this_mirror); - status = submit_bio_hook(inode, repair_bio, failrec->this_mirror, - failrec->bio_flags); - if (status) { - free_io_failure(failure_tree, tree, failrec); - bio_put(repair_bio); - } - return blk_status_to_errno(status); + /* + * At this point we have a bio, so any errors from submit_bio_hook() + * will be handled by the endio on the repair_bio, so we can't return an + * error here. + */ + submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->bio_flags); + return BLK_STS_OK; } static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) @@ -3068,6 +3068,14 @@ static void end_bio_extent_readpage(struct bio *bio) if (is_data_inode(inode)) { /* + * If we failed to submit the IO at all we'll have a + * mirror_num == 0, in which case we need to just mark + * the page with an error and unlock it and carry on. + */ + if (mirror == 0) + goto readpage_ok; + + /* * btrfs_submit_read_repair() will handle all the good * and bad sectors, we just continue to the next bvec. */ @@ -3143,7 +3151,7 @@ struct bio *btrfs_bio_alloc(unsigned int nr_iovecs) struct bio *bio; ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS); - bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset); + bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset); btrfs_bio_init(btrfs_bio(bio)); return bio; } @@ -3154,7 +3162,7 @@ struct bio *btrfs_bio_clone(struct bio *bio) struct bio *new; /* Bio allocation backed by a bioset does not fail */ - new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset); + new = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOFS, &btrfs_bioset); bbio = btrfs_bio(new); btrfs_bio_init(bbio); bbio->iter = bio->bi_iter; @@ -3169,7 +3177,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) ASSERT(offset <= UINT_MAX && size <= UINT_MAX); /* this will never fail when it's backed by a bioset */ - bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset); + bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); ASSERT(bio); bbio = btrfs_bio(bio); @@ -3534,7 +3542,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, } em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); - if (em_cached && !IS_ERR_OR_NULL(em)) { + if (em_cached && !IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; @@ -3563,7 +3571,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, u64 cur_end; struct extent_map *em; int ret = 0; - int nr = 0; size_t pg_offset = 0; size_t iosize; size_t blocksize = inode->i_sb->s_blocksize; @@ -3608,9 +3615,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } em = __get_extent_map(inode, page, pg_offset, cur, end - cur + 1, em_cached); - if (IS_ERR_OR_NULL(em)) { + if (IS_ERR(em)) { unlock_extent(tree, cur, end); end_page_read(page, false, cur, end + 1 - cur); + ret = PTR_ERR(em); break; } extent_offset = cur - em->start; @@ -3721,9 +3729,7 @@ int btrfs_do_readpage(struct page 
*page, struct extent_map **em_cached, end_bio_extent_readpage, 0, this_bio_flag, force_bio_submit); - if (!ret) { - nr++; - } else { + if (ret) { unlock_extent(tree, cur, cur + iosize - 1); end_page_read(page, false, cur, iosize); goto out; @@ -3951,7 +3957,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, } em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); - if (IS_ERR_OR_NULL(em)) { + if (IS_ERR(em)) { btrfs_page_set_error(fs_info, page, cur, end - cur + 1); ret = PTR_ERR_OR_ZERO(em); break; @@ -4048,6 +4054,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, static int __extent_writepage(struct page *page, struct writeback_control *wbc, struct extent_page_data *epd) { + struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u64 page_start = page_offset(page); @@ -4068,8 +4075,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, pg_offset = offset_in_page(i_size); if (page->index > end_index || (page->index == end_index && !pg_offset)) { - page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); - unlock_page(page); + folio_invalidate(folio, 0, folio_size(folio)); + folio_unlock(folio); return 0; } @@ -4780,11 +4787,12 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, return ret; } if (cache) { - /* Impiles write in zoned mode */ - btrfs_put_block_group(cache); - /* Mark the last eb in a block group */ + /* + * Implies write in zoned mode. Mark the last eb in a block group. + */ if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity) set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags); + btrfs_put_block_group(cache); } ret = write_one_eb(eb, wbc, epd); free_extent_buffer(eb); @@ -5218,17 +5226,17 @@ void extent_readahead(struct readahead_control *rac) } /* - * basic invalidatepage code, this waits on any locked or writeback - * ranges corresponding to the page, and then deletes any extent state + * basic invalidate_folio code, this waits on any locked or writeback + * ranges corresponding to the folio, and then deletes any extent state * records from the tree */ -int extent_invalidatepage(struct extent_io_tree *tree, - struct page *page, unsigned long offset) +int extent_invalidate_folio(struct extent_io_tree *tree, + struct folio *folio, size_t offset) { struct extent_state *cached_state = NULL; - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - size_t blocksize = page->mapping->host->i_sb->s_blocksize; + u64 start = folio_pos(folio); + u64 end = start + folio_size(folio) - 1; + size_t blocksize = folio->mapping->host->i_sb->s_blocksize; /* This function is only called for the btree inode */ ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO); @@ -5238,7 +5246,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, return 0; lock_extent_bits(tree, start, end, &cached_state); - wait_on_page_writeback(page); + folio_wait_writeback(folio); /* * Currently for btree io tree, only EXTENT_LOCKED is utilized, @@ -5390,7 +5398,7 @@ static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode, break; len = ALIGN(len, sectorsize); em = btrfs_get_extent_fiemap(inode, offset, len); - if (IS_ERR_OR_NULL(em)) + if (IS_ERR(em)) return em; /* if this isn't a hole return it */ @@ -6841,14 +6849,24 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, { struct btrfs_fs_info *fs_info = eb->fs_info; + /* + * If we are using the 
commit root we could potentially clear a page + * Uptodate while we're using the extent buffer that we've previously + * looked up. We don't want to complain in this case, as the page was + * valid before, we just didn't write it out. Instead we want to catch + * the case where we didn't actually read the block properly, which + * would have !PageUptodate && !PageError, as we clear PageError before + * reading. + */ if (fs_info->sectorsize < PAGE_SIZE) { - bool uptodate; + bool uptodate, error; uptodate = btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len); - WARN_ON(!uptodate); + error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len); + WARN_ON(!uptodate && !error); } else { - WARN_ON(!PageUptodate(page)); + WARN_ON(!PageUptodate(page) && !PageError(page)); } } diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 5a36add21305..6fee14ce2e6b 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -261,6 +261,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; em->mod_start = merge->mod_start; em->generation = max(em->generation, merge->generation); + set_bit(EXTENT_FLAG_MERGED, &em->flags); rb_erase_cached(&merge->rb_node, &tree->map); RB_CLEAR_NODE(&merge->rb_node); @@ -278,6 +279,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) RB_CLEAR_NODE(&merge->rb_node); em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; em->generation = max(em->generation, merge->generation); + set_bit(EXTENT_FLAG_MERGED, &em->flags); free_extent_map(merge); } } @@ -490,6 +492,8 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, */ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { + lockdep_assert_held_write(&tree->lock); + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); rb_erase_cached(&em->rb_node, &tree->map); if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) @@ -504,6 +508,8 @@ void replace_extent_mapping(struct extent_map_tree *tree, struct extent_map *new, int modified) { + lockdep_assert_held_write(&tree->lock); + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags)); ASSERT(extent_map_in_tree(cur)); if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags)) diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 8e217337dff9..d2fa32ffe304 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -25,6 +25,8 @@ enum { EXTENT_FLAG_FILLING, /* filesystem extent mapping type */ EXTENT_FLAG_FS_MAPPING, + /* This em is merged from two or more physically adjacent ems */ + EXTENT_FLAG_MERGED, }; struct extent_map { @@ -40,6 +42,12 @@ struct extent_map { u64 ram_bytes; u64 block_start; u64 block_len; + + /* + * Generation of the extent map, for merged em it's the highest + * generation of all merged ems. + * For non-merged extents, it's from btrfs_file_extent_item::generation. 
+ */ u64 generation; unsigned long flags; /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */ diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 90c5c38836ab..c828f971a346 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -305,7 +305,7 @@ found: read_extent_buffer(path->nodes[0], dst, (unsigned long)item, ret * csum_size); out: - if (ret == -ENOENT) + if (ret == -ENOENT || ret == -EFBIG) ret = 0; return ret; } @@ -368,6 +368,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_bio *bbio = NULL; struct btrfs_path *path; const u32 sectorsize = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; @@ -377,6 +378,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst u8 *csum; const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; int count = 0; + blk_status_t ret = BLK_STS_OK; if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) @@ -400,7 +402,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst return BLK_STS_RESOURCE; if (!dst) { - struct btrfs_bio *bbio = btrfs_bio(bio); + bbio = btrfs_bio(bio); if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); @@ -456,21 +458,27 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst count = search_csum_tree(fs_info, path, cur_disk_bytenr, search_len, csum_dst); - if (count <= 0) { - /* - * Either we hit a critical error or we didn't find - * the csum. - * Either way, we put zero into the csums dst, and skip - * to the next sector. - */ + if (count < 0) { + ret = errno_to_blk_status(count); + if (bbio) + btrfs_bio_free_csum(bbio); + break; + } + + /* + * We didn't find a csum for this range. We need to make sure + * we complain loudly about this, because we are not NODATASUM. + * + * However for the DATA_RELOC inode we could potentially be + * relocating data extents for a NODATASUM inode, so the inode + * itself won't be marked with NODATASUM, but the extent we're + * copying is in fact NODATASUM. If we don't find a csum we + * assume this is the case. + */ + if (count == 0) { memset(csum_dst, 0, csum_size); count = 1; - /* - * For data reloc inode, we need to mark the range - * NODATASUM so that balance won't report false csum - * error. - */ if (BTRFS_I(inode)->root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { u64 file_offset; @@ -491,7 +499,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst } btrfs_free_path(path); - return BLK_STS_OK; + return ret; } int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, @@ -612,32 +620,33 @@ fail: return ret; } -/* - * btrfs_csum_one_bio - Calculates checksums of the data contained inside a bio +/** + * Calculate checksums of the data contained inside a bio + * * @inode: Owner of the data inside the bio * @bio: Contains the data to be checksummed - * @file_start: offset in file this bio begins to describe - * @contig: Boolean. If true/1 means all bio vecs in this bio are - * contiguous and they begin at @file_start in the file. False/0 - * means this bio can contain potentially discontiguous bio vecs - * so the logical offset of each should be calculated separately. 
+ * @offset: If (u64)-1, @bio may contain discontiguous bio vecs, so the + * file offsets are determined from the page offsets in the bio. + * Otherwise, this is the starting file offset of the bio vecs in + * @bio, which must be contiguous. + * @one_ordered: If true, @bio only refers to one ordered extent. */ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 file_start, int contig) + u64 offset, bool one_ordered) { struct btrfs_fs_info *fs_info = inode->root->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered = NULL; + const bool use_page_offsets = (offset == (u64)-1); char *data; struct bvec_iter iter; struct bio_vec bvec; int index; - int nr_sectors; + unsigned int blockcount; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; int i; - u64 offset; unsigned nofs_flag; nofs_flag = memalloc_nofs_save(); @@ -651,18 +660,13 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); - if (contig) - offset = file_start; - else - offset = 0; /* shut up gcc */ - sums->bytenr = bio->bi_iter.bi_sector << 9; index = 0; shash->tfm = fs_info->csum_shash; bio_for_each_segment(bvec, bio, iter) { - if (!contig) + if (use_page_offsets) offset = page_offset(bvec.bv_page) + bvec.bv_offset; if (!ordered) { @@ -681,13 +685,14 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, } } - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, + blockcount = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len + fs_info->sectorsize - 1); - for (i = 0; i < nr_sectors; i++) { - if (offset >= ordered->file_offset + ordered->num_bytes || - offset < ordered->file_offset) { + for (i = 0; i < blockcount; i++) { + if (!one_ordered && + !in_range(offset, ordered->file_offset, + ordered->num_bytes)) { unsigned long bytes_left; sums->len = this_sum_bytes; @@ -1211,6 +1216,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, extent_start = key.offset; extent_end = btrfs_file_extent_end(path); em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + em->generation = btrfs_file_extent_generation(leaf, fi); if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { em->start = extent_start; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 11204dbbe053..9f455c96c974 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -50,11 +50,14 @@ struct inode_defrag { /* root objectid */ u64 root; - /* last offset we were able to defrag */ - u64 last_offset; - - /* if we've wrapped around back to zero once already */ - int cycled; + /* + * The extent size threshold for autodefrag. + * + * This value is different for compressed/non-compressed extents, + * thus needs to be passed from higher layer. 
+ * (aka, inode_should_defrag()) + */ + u32 extent_thresh; }; static int __compare_inode_defrag(struct inode_defrag *defrag1, @@ -107,8 +110,8 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, */ if (defrag->transid < entry->transid) entry->transid = defrag->transid; - if (defrag->last_offset > entry->last_offset) - entry->last_offset = defrag->last_offset; + entry->extent_thresh = min(defrag->extent_thresh, + entry->extent_thresh); return -EEXIST; } } @@ -134,7 +137,7 @@ static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) * enabled */ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode) + struct btrfs_inode *inode, u32 extent_thresh) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -160,6 +163,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, defrag->ino = btrfs_ino(inode); defrag->transid = transid; defrag->root = root->root_key.objectid; + defrag->extent_thresh = extent_thresh; spin_lock(&fs_info->defrag_inodes_lock); if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) { @@ -179,34 +183,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, } /* - * Requeue the defrag object. If there is a defrag object that points to - * the same inode in the tree, we will merge them together (by - * __btrfs_add_inode_defrag()) and free the one that we want to requeue. - */ -static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode, - struct inode_defrag *defrag) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - int ret; - - if (!__need_auto_defrag(fs_info)) - goto out; - - /* - * Here we don't check the IN_DEFRAG flag, because we need merge - * them together. - */ - spin_lock(&fs_info->defrag_inodes_lock); - ret = __btrfs_add_inode_defrag(inode, defrag); - spin_unlock(&fs_info->defrag_inodes_lock); - if (ret) - goto out; - return; -out: - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); -} - -/* * pick the defragable inode that we want, if it doesn't exist, we will get * the next one. */ @@ -278,8 +254,14 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct btrfs_root *inode_root; struct inode *inode; struct btrfs_ioctl_defrag_range_args range; - int num_defrag; - int ret; + int ret = 0; + u64 cur = 0; + +again: + if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) + goto cleanup; + if (!__need_auto_defrag(fs_info)) + goto cleanup; /* get the inode */ inode_root = btrfs_get_fs_root(fs_info, defrag->root, true); @@ -295,39 +277,30 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, goto cleanup; } + if (cur >= i_size_read(inode)) { + iput(inode); + goto cleanup; + } + /* do a chunk of defrag */ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); memset(&range, 0, sizeof(range)); range.len = (u64)-1; - range.start = defrag->last_offset; + range.start = cur; + range.extent_thresh = defrag->extent_thresh; sb_start_write(fs_info->sb); - num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, BTRFS_DEFRAG_BATCH); sb_end_write(fs_info->sb); - /* - * if we filled the whole defrag batch, there - * must be more work to do. 
Queue this defrag - * again - */ - if (num_defrag == BTRFS_DEFRAG_BATCH) { - defrag->last_offset = range.start; - btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag); - } else if (defrag->last_offset && !defrag->cycled) { - /* - * we didn't fill our defrag batch, but - * we didn't start at zero. Make sure we loop - * around to the start of the file. - */ - defrag->last_offset = 0; - defrag->cycled = 1; - btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag); - } else { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - } - iput(inode); - return 0; + + if (ret < 0) + goto cleanup; + + cur = max(cur + fs_info->sectorsize, range.start); + goto again; + cleanup: kmem_cache_free(btrfs_inode_defrag_cachep, defrag); return ret; @@ -718,7 +691,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, int modify_tree = -1; int update_refs; int found = 0; - int leafs_visited = 0; struct btrfs_path *path = args->path; args->bytes_found = 0; @@ -756,7 +728,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, path->slots[0]--; } ret = 0; - leafs_visited++; next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { @@ -768,7 +739,6 @@ next_slot: ret = 0; break; } - leafs_visited++; leaf = path->nodes[0]; recow = 1; } @@ -1014,7 +984,7 @@ delete_extent_item: * which case it unlocked our path, so check path->locks[0] matches a * write lock. */ - if (!ret && args->replace_extent && leafs_visited == 1 && + if (!ret && args->replace_extent && path->locks[0] == BTRFS_WRITE_LOCK && btrfs_leaf_free_space(leaf) >= sizeof(struct btrfs_item) + args->extent_item_size) { @@ -1749,7 +1719,8 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, fs_info->sectorsize); WARN_ON(reserve_bytes == 0); ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), - reserve_bytes); + reserve_bytes, + reserve_bytes); if (ret) { if (!only_release_metadata) btrfs_free_reserved_data_space(BTRFS_I(inode), @@ -2066,12 +2037,43 @@ out: return err < 0 ? err : written; } -static ssize_t btrfs_file_write_iter(struct kiocb *iocb, - struct iov_iter *from) +static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + loff_t count; + ssize_t ret; + + btrfs_inode_lock(inode, 0); + count = encoded->len; + ret = generic_write_checks_count(iocb, &count); + if (ret == 0 && count != encoded->len) { + /* + * The write got truncated by generic_write_checks_count(). We + * can't do a partial encoded write. 
+ */ + ret = -EFBIG; + } + if (ret || encoded->len == 0) + goto out; + + ret = btrfs_write_check(iocb, from, encoded->len); + if (ret < 0) + goto out; + + ret = btrfs_do_encoded_write(iocb, from, encoded); +out: + btrfs_inode_unlock(inode, 0); + return ret; +} + +ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded) { struct file *file = iocb->ki_filp; struct btrfs_inode *inode = BTRFS_I(file_inode(file)); - ssize_t num_written = 0; + ssize_t num_written, num_sync; const bool sync = iocb->ki_flags & IOCB_DSYNC; /* @@ -2082,22 +2084,28 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, if (BTRFS_FS_ERROR(inode->root->fs_info)) return -EROFS; - if (!(iocb->ki_flags & IOCB_DIRECT) && - (iocb->ki_flags & IOCB_NOWAIT)) + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) return -EOPNOTSUPP; if (sync) atomic_inc(&inode->sync_writers); - if (iocb->ki_flags & IOCB_DIRECT) - num_written = btrfs_direct_write(iocb, from); - else - num_written = btrfs_buffered_write(iocb, from); + if (encoded) { + num_written = btrfs_encoded_write(iocb, from, encoded); + num_sync = encoded->len; + } else if (iocb->ki_flags & IOCB_DIRECT) { + num_written = num_sync = btrfs_direct_write(iocb, from); + } else { + num_written = num_sync = btrfs_buffered_write(iocb, from); + } btrfs_set_inode_last_sub_trans(inode); - if (num_written > 0) - num_written = generic_write_sync(iocb, num_written); + if (num_sync > 0) { + num_sync = generic_write_sync(iocb, num_sync); + if (num_sync < 0) + num_written = num_sync; + } if (sync) atomic_dec(&inode->sync_writers); @@ -2106,6 +2114,11 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, return num_written; } +static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + return btrfs_do_write_iter(iocb, from, NULL); +} + int btrfs_release_file(struct inode *inode, struct file *filp) { struct btrfs_file_private *private = filp->private_data; @@ -2501,7 +2514,7 @@ out: hole_em = alloc_extent_map(); if (!hole_em) { btrfs_drop_extent_cache(inode, offset, end - 1, 0); - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); + btrfs_set_inode_full_sync(inode); } else { hole_em->start = offset; hole_em->len = end - offset; @@ -2522,8 +2535,7 @@ out: } while (ret == -EEXIST); free_extent_map(hole_em); if (ret) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &inode->runtime_flags); + btrfs_set_inode_full_sync(inode); } return 0; @@ -2877,7 +2889,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, * maps for the replacement extents (or holes). */ if (extent_info && !extent_info->is_new_extent) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); + btrfs_set_inode_full_sync(inode); if (ret) goto out_trans; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 655aad0f9e1c..0ae54d8c10d6 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -25,6 +25,8 @@ static struct btrfs_root *btrfs_free_space_root( .offset = 0, }; + if (btrfs_fs_incompat(block_group->fs_info, EXTENT_TREE_V2)) + key.offset = block_group->global_root_id; return btrfs_global_root(block_group->fs_info, &key); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3b2403b6127f..aa0a60ee26cb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -66,6 +66,11 @@ struct btrfs_dio_data { struct extent_changeset *data_reserved; }; +struct btrfs_rename_ctx { + /* Output field. Stores the index number of the old directory entry. 
*/ + u64 index; +}; + static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; static const struct inode_operations btrfs_special_inode_operations; @@ -234,12 +239,14 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, * no overlapping inline items exist in the btree */ static int insert_inline_extent(struct btrfs_trans_handle *trans, - struct btrfs_path *path, bool extent_inserted, - struct btrfs_root *root, struct inode *inode, - u64 start, size_t size, size_t compressed_size, + struct btrfs_path *path, + struct btrfs_inode *inode, bool extent_inserted, + size_t size, size_t compressed_size, int compress_type, - struct page **compressed_pages) + struct page **compressed_pages, + bool update_i_size) { + struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct page *page = NULL; char *kaddr; @@ -247,7 +254,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item *ei; int ret; size_t cur_size = size; - unsigned long offset; + u64 i_size; ASSERT((compressed_size > 0 && compressed_pages) || (compressed_size == 0 && !compressed_pages)); @@ -259,8 +266,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_key key; size_t datasize; - key.objectid = btrfs_ino(BTRFS_I(inode)); - key.offset = start; + key.objectid = btrfs_ino(inode); + key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; datasize = btrfs_file_extent_calc_inline_size(cur_size); @@ -298,12 +305,10 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, ei, compress_type); } else { - page = find_get_page(inode->i_mapping, - start >> PAGE_SHIFT); + page = find_get_page(inode->vfs_inode.i_mapping, 0); btrfs_set_file_extent_compression(leaf, ei, 0); kaddr = kmap_atomic(page); - offset = offset_in_page(start); - write_extent_buffer(leaf, kaddr + offset, ptr, size); + write_extent_buffer(leaf, kaddr, ptr, size); kunmap_atomic(kaddr); put_page(page); } @@ -314,21 +319,25 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, * We align size to sectorsize for inline extents just for simplicity * sake. */ - size = ALIGN(size, root->fs_info->sectorsize); - ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size); + ret = btrfs_inode_set_file_extent_range(inode, 0, + ALIGN(size, root->fs_info->sectorsize)); if (ret) goto fail; /* - * we're an inline extent, so nobody can - * extend the file past i_size without locking - * a page we already have locked. + * We're an inline extent, so nobody can extend the file past i_size + * without locking a page we already have locked. * - * We must do any isize and inode updates - * before we unlock the pages. Otherwise we - * could end up racing with unlink. + * We must do any i_size and inode updates before we unlock the pages. + * Otherwise we could end up racing with unlink. */ - BTRFS_I(inode)->disk_i_size = inode->i_size; + i_size = i_size_read(&inode->vfs_inode); + if (update_i_size && size > i_size) { + i_size_write(&inode->vfs_inode, size); + i_size = size; + } + inode->disk_i_size = i_size; + fail: return ret; } @@ -339,35 +348,31 @@ fail: * does the checks required to make sure the data is small enough * to fit as an inline extent. 
*/ -static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, - u64 end, size_t compressed_size, +static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, + size_t compressed_size, int compress_type, - struct page **compressed_pages) + struct page **compressed_pages, + bool update_i_size) { struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; - u64 isize = i_size_read(&inode->vfs_inode); - u64 actual_end = min(end + 1, isize); - u64 inline_len = actual_end - start; - u64 aligned_end = ALIGN(end, fs_info->sectorsize); - u64 data_len = inline_len; + u64 data_len = (compressed_size ?: size); int ret; struct btrfs_path *path; - if (compressed_size) - data_len = compressed_size; - - if (start > 0 || - actual_end > fs_info->sectorsize || + /* + * We can create an inline extent if it ends at or beyond the current + * i_size, is no larger than a sector (decompressed), and the (possibly + * compressed) data fits in a leaf and the configured maximum inline + * size. + */ + if (size < i_size_read(&inode->vfs_inode) || + size > fs_info->sectorsize || data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || - (!compressed_size && - (actual_end & (fs_info->sectorsize - 1)) == 0) || - end + 1 < isize || - data_len > fs_info->max_inline) { + data_len > fs_info->max_inline) return 1; - } path = btrfs_alloc_path(); if (!path) @@ -381,30 +386,20 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, trans->block_rsv = &inode->block_rsv; drop_args.path = path; - drop_args.start = start; - drop_args.end = aligned_end; + drop_args.start = 0; + drop_args.end = fs_info->sectorsize; drop_args.drop_cache = true; drop_args.replace_extent = true; - - if (compressed_size && compressed_pages) - drop_args.extent_item_size = btrfs_file_extent_calc_inline_size( - compressed_size); - else - drop_args.extent_item_size = btrfs_file_extent_calc_inline_size( - inline_len); - + drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len); ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - if (isize > actual_end) - inline_len = min_t(u64, isize, actual_end); - ret = insert_inline_extent(trans, path, drop_args.extent_inserted, - root, &inode->vfs_inode, start, - inline_len, compressed_size, - compress_type, compressed_pages); + ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, + size, compressed_size, compress_type, + compressed_pages, update_i_size); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; @@ -413,7 +408,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, goto out; } - btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found); + btrfs_update_inode_bytes(inode, size, drop_args.bytes_found); ret = btrfs_update_inode(trans, root, inode); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); @@ -423,7 +418,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, goto out; } - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); + btrfs_set_inode_full_sync(inode); out: /* * Don't forget to free the reserved space, as for inlined extent @@ -560,12 +555,12 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, } static inline void inode_should_defrag(struct btrfs_inode *inode, - u64 start, u64 end, u64 
num_bytes, u64 small_write) + u64 start, u64 end, u64 num_bytes, u32 small_write) { /* If this is a small write inside eof, kick off a defrag */ if (num_bytes < small_write && (start > 0 || end + 1 < inode->disk_i_size)) - btrfs_add_inode_defrag(NULL, inode); + btrfs_add_inode_defrag(NULL, inode, small_write); } /* @@ -624,7 +619,6 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) again: will_compress = 0; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; - BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0); nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED / PAGE_SIZE); @@ -735,14 +729,15 @@ cont: /* we didn't compress the entire range, try * to make an uncompressed inline extent. */ - ret = cow_file_range_inline(BTRFS_I(inode), start, end, + ret = cow_file_range_inline(BTRFS_I(inode), actual_end, 0, BTRFS_COMPRESS_NONE, - NULL); + NULL, false); } else { /* try making a compressed inline extent */ - ret = cow_file_range_inline(BTRFS_I(inode), start, end, + ret = cow_file_range_inline(BTRFS_I(inode), actual_end, total_compressed, - compress_type, pages); + compress_type, pages, + false); } if (ret <= 0) { unsigned long clear_flags = EXTENT_DELALLOC | @@ -981,11 +976,14 @@ static int submit_one_async_extent(struct btrfs_inode *inode, } free_extent_map(em); - ret = btrfs_add_ordered_extent_compress(inode, start, /* file_offset */ - ins.objectid, /* disk_bytenr */ - async_extent->ram_size, /* num_bytes */ - ins.offset, /* disk_num_bytes */ - async_extent->compress_type); + ret = btrfs_add_ordered_extent(inode, start, /* file_offset */ + async_extent->ram_size, /* num_bytes */ + async_extent->ram_size, /* ram_bytes */ + ins.objectid, /* disk_bytenr */ + ins.offset, /* disk_num_bytes */ + 0, /* offset */ + 1 << BTRFS_ORDERED_COMPRESSED, + async_extent->compress_type); if (ret) { btrfs_drop_extent_cache(inode, start, end, 0); goto out_free_reserve; @@ -1003,7 +1001,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode, async_extent->pages, /* compressed_pages */ async_extent->nr_pages, async_chunk->write_flags, - async_chunk->blkcg_css)) { + async_chunk->blkcg_css, true)) { const u64 start = async_extent->start; const u64 end = start + async_extent->ram_size - 1; @@ -1152,9 +1150,12 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * So here we skip inline extent creation completely. 
*/ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { + u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode), + end + 1); + /* lets try to make an inline extent */ - ret = cow_file_range_inline(inode, start, end, 0, - BTRFS_COMPRESS_NONE, NULL); + ret = cow_file_range_inline(inode, actual_end, 0, + BTRFS_COMPRESS_NONE, NULL, false); if (ret == 0) { /* * We use DO_ACCOUNTING here because we need the @@ -1234,9 +1235,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode, } free_extent_map(em); - ret = btrfs_add_ordered_extent(inode, start, ins.objectid, - ram_size, cur_alloc_size, - BTRFS_ORDERED_REGULAR); + ret = btrfs_add_ordered_extent(inode, start, ram_size, ram_size, + ins.objectid, cur_alloc_size, 0, + 1 << BTRFS_ORDERED_REGULAR, + BTRFS_COMPRESS_NONE); if (ret) goto out_drop_extent_cache; @@ -1895,10 +1897,11 @@ out_check: goto error; } free_extent_map(em); - ret = btrfs_add_ordered_extent(inode, cur_offset, - disk_bytenr, num_bytes, - num_bytes, - BTRFS_ORDERED_PREALLOC); + ret = btrfs_add_ordered_extent(inode, + cur_offset, num_bytes, num_bytes, + disk_bytenr, num_bytes, 0, + 1 << BTRFS_ORDERED_PREALLOC, + BTRFS_COMPRESS_NONE); if (ret) { btrfs_drop_extent_cache(inode, cur_offset, cur_offset + num_bytes - 1, @@ -1907,9 +1910,11 @@ out_check: } } else { ret = btrfs_add_ordered_extent(inode, cur_offset, + num_bytes, num_bytes, disk_bytenr, num_bytes, - num_bytes, - BTRFS_ORDERED_NOCOW); + 0, + 1 << BTRFS_ORDERED_NOCOW, + BTRFS_COMPRESS_NONE); if (ret) goto error; } @@ -2310,7 +2315,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, u64 dio_file_offset) { - return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); + return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); } /* @@ -2538,10 +2543,15 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, goto out; if (bio_flags & EXTENT_BIO_COMPRESSED) { + /* + * btrfs_submit_compressed_read will handle completing + * the bio if there were any errors, so just return + * here. 
+ */ ret = btrfs_submit_compressed_read(inode, bio, mirror_num, bio_flags); - goto out; + goto out_no_endio; } else { /* * Lookup bio sums does extra checks around whether we @@ -2562,7 +2572,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, 0, btrfs_submit_bio_start); goto out; } else if (!skip_sum) { - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); + ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); if (ret) goto out; } @@ -2575,6 +2585,7 @@ out: bio->bi_status = ret; bio_endio(bio); } +out_no_endio: return ret; } @@ -2870,6 +2881,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_key ins; u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi); + u64 offset = btrfs_stack_file_extent_offset(stack_fi); u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi); u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi); struct btrfs_drop_extents_args drop_args = { 0 }; @@ -2944,7 +2956,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, goto out; ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), - file_pos, qgroup_reserved, &ins); + file_pos - offset, + qgroup_reserved, &ins); out: btrfs_free_path(path); @@ -2970,20 +2983,20 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_extent *oe) { struct btrfs_file_extent_item stack_fi; - u64 logical_len; bool update_inode_bytes; + u64 num_bytes = oe->num_bytes; + u64 ram_bytes = oe->ram_bytes; memset(&stack_fi, 0, sizeof(stack_fi)); btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr); btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, oe->disk_num_bytes); + btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) - logical_len = oe->truncated_len; - else - logical_len = oe->num_bytes; - btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len); - btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len); + num_bytes = ram_bytes = oe->truncated_len; + btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes); + btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes); btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); /* Encryption and other encoding is reserved and all 0 */ @@ -2994,6 +3007,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, * except if the ordered extent was truncated. 
*/ update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) || + test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) || test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), @@ -3028,7 +3042,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && - !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags)) + !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags)) clear_bits |= EXTENT_DELALLOC_NEW; freespace_inode = btrfs_is_free_space_inode(inode); @@ -4062,7 +4077,8 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, int name_len) + const char *name, int name_len, + struct btrfs_rename_ctx *rename_ctx) { struct btrfs_root *root = dir->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -4118,15 +4134,27 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto err; } skip_backref: + if (rename_ctx) + rename_ctx->index = index; + ret = btrfs_delete_delayed_dir_index(trans, dir, index); if (ret) { btrfs_abort_transaction(trans, ret); goto err; } - btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, - dir_ino); - btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); + /* + * If we are in a rename context, we don't need to update anything in the + * log. That will be done later during the rename by btrfs_log_new_name(). + * Besides that, doing it here would only cause extra unncessary btree + * operations on the log tree, increasing latency for applications. 
+ */ + if (!rename_ctx) { + btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, + dir_ino); + btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, + index); + } /* * If we have a pending delayed iput we could end up with the final iput @@ -4158,7 +4186,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, const char *name, int name_len) { int ret; - ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len); + ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL); if (!ret) { drop_nlink(&inode->vfs_inode); ret = btrfs_update_inode(trans, inode->root, inode); @@ -4565,14 +4593,21 @@ out_up_write: static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; int err = 0; struct btrfs_trans_handle *trans; u64 last_unlink_trans; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; - if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) + if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) { + if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) { + btrfs_err(fs_info, + "extent tree v2 doesn't support snapshot deletion yet"); + return -EOPNOTSUPP; + } return btrfs_delete_subvolume(dir, dentry); + } trans = __unlink_start_trans(dir); if (IS_ERR(trans)) @@ -4611,7 +4646,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) } out: btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); + btrfs_btree_balance_dirty(fs_info); return err; } @@ -4664,7 +4699,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, goto out; } } - ret = btrfs_delalloc_reserve_metadata(inode, blocksize); + ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize); if (ret < 0) { if (!only_release_metadata) btrfs_free_reserved_data_space(inode, data_reserved, @@ -4876,8 +4911,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) cur_offset + hole_size - 1, 0); hole_em = alloc_extent_map(); if (!hole_em) { - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &inode->runtime_flags); + btrfs_set_inode_full_sync(inode); goto next; } hole_em->start = cur_offset; @@ -5046,16 +5080,17 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr } /* - * While truncating the inode pages during eviction, we get the VFS calling - * btrfs_invalidatepage() against each page of the inode. This is slow because - * the calls to btrfs_invalidatepage() result in a huge amount of calls to - * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting - * extent_state structures over and over, wasting lots of time. + * While truncating the inode pages during eviction, we get the VFS + * calling btrfs_invalidate_folio() against each folio of the inode. This + * is slow because the calls to btrfs_invalidate_folio() result in a + * huge amount of calls to lock_extent_bits() and clear_extent_bit(), + * which keep merging and splitting extent_state structures over and over, + * wasting lots of time. * - * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all - * those expensive operations on a per page basis and do only the ordered io - * finishing, while we release here the extent_map and extent_state structures, - * without the excessive merging and splitting. 
+ * Therefore if the inode is being evicted, let btrfs_invalidate_folio() + * skip all those expensive operations on a per folio basis and do only + * the ordered io finishing, while we release here the extent_map and + * extent_state structures, without the excessive merging and splitting. */ static void evict_inode_truncate_pages(struct inode *inode) { @@ -5121,7 +5156,7 @@ static void evict_inode_truncate_pages(struct inode *inode) * If still has DELALLOC flag, the extent didn't reach disk, * and its reserved space won't be freed by delayed_ref. * So we need to free its reserved space here. - * (Refer to comment in btrfs_invalidatepage, case 2) + * (Refer to comment in btrfs_invalidate_folio, case 2) * * Note, end is the bytenr of last byte, so we need + 1 here. */ @@ -5584,21 +5619,17 @@ static struct inode *new_simple_dir(struct super_block *s, return inode; } +static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN); +static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE); +static_assert(BTRFS_FT_DIR == FT_DIR); +static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV); +static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV); +static_assert(BTRFS_FT_FIFO == FT_FIFO); +static_assert(BTRFS_FT_SOCK == FT_SOCK); +static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK); + static inline u8 btrfs_inode_type(struct inode *inode) { - /* - * Compile-time asserts that generic FT_* types still match - * BTRFS_FT_* types - */ - BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN); - BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE); - BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR); - BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV); - BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV); - BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO); - BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK); - BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK); - return fs_umode_to_ftype(inode->i_mode); } @@ -5971,14 +6002,8 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) goto out; ret = 0; - /* - * MAGIC NUMBER EXPLANATION: - * since we search a directory based on f_pos we have to start at 2 - * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody - * else has to start at 2 - */ if (path->slots[0] == 0) { - inode->index_cnt = 2; + inode->index_cnt = BTRFS_DIR_START_INDEX; goto out; } @@ -5989,7 +6014,7 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) if (found_key.objectid != btrfs_ino(inode) || found_key.type != BTRFS_DIR_INDEX_KEY) { - inode->index_cnt = 2; + inode->index_cnt = BTRFS_DIR_START_INDEX; goto out; } @@ -6140,7 +6165,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, * sync since it will be a full sync anyway and this will blow away the * old info in the log. 
*/ - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + btrfs_set_inode_full_sync(BTRFS_I(inode)); key[0].objectid = objectid; key[0].type = BTRFS_INODE_ITEM_KEY; @@ -6537,7 +6562,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; } d_instantiate(dentry, inode); - btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent); + btrfs_log_new_name(trans, old_dentry, NULL, 0, parent); } fail: @@ -7040,8 +7065,11 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, if (IS_ERR(em)) goto out; } - ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len, - block_len, type); + ret = btrfs_add_ordered_extent(inode, start, len, len, block_start, + block_len, 0, + (1 << type) | + (1 << BTRFS_ORDERED_DIRECT), + BTRFS_COMPRESS_NONE); if (ret) { if (em) { free_extent_map(em); @@ -7441,7 +7469,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, struct extent_map *em2; /* We can NOCOW, so only need to reserve metadata space. */ - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len); if (ret < 0) { /* Our caller expects us to free the input extent map. */ free_extent_map(em); @@ -7600,6 +7628,34 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, } len = min(len, em->len - (start - em->start)); + + /* + * If we have a NOWAIT request and the range contains multiple extents + * (or a mix of extents and holes), then we return -EAGAIN to make the + * caller fallback to a context where it can do a blocking (without + * NOWAIT) request. This way we avoid doing partial IO and returning + * success to the caller, which is not optimal for writes and for reads + * it can result in unexpected behaviour for an application. + * + * When doing a read, because we use IOMAP_DIO_PARTIAL when calling + * iomap_dio_rw(), we can end up returning less data then what the caller + * asked for, resulting in an unexpected, and incorrect, short read. + * That is, the caller asked to read N bytes and we return less than that, + * which is wrong unless we are crossing EOF. This happens if we get a + * page fault error when trying to fault in pages for the buffer that is + * associated to the struct iov_iter passed to iomap_dio_rw(), and we + * have previously submitted bios for other extents in the range, in + * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of + * those bios have completed by the time we get the page fault error, + * which we return back to our caller - we should only return EIOCBQUEUED + * after we have submitted bios for all the extents in the range. + */ + if ((flags & IOMAP_NOWAIT) && len < length) { + free_extent_map(em); + ret = -EAGAIN; + goto unlock_err; + } + if (write) { ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, start, len); @@ -7803,7 +7859,7 @@ static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, struct bio *bio, u64 dio_file_offset) { - return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, 1); + return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false); } static void btrfs_end_dio_bio(struct bio *bio) @@ -7860,7 +7916,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, * If we aren't doing async submit, calculate the csum of the * bio now. 
*/ - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1); + ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false); if (ret) goto err; } else { @@ -8076,8 +8132,13 @@ int btrfs_readpage(struct file *file, struct page *page) btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); - if (bio_ctrl.bio) - ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags); + if (bio_ctrl.bio) { + int ret2; + + ret2 = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags); + if (ret == 0) + ret = ret2; + } return ret; } @@ -8118,8 +8179,8 @@ static void btrfs_readahead(struct readahead_control *rac) } /* - * For releasepage() and invalidatepage() we have a race window where - * end_page_writeback() is called but the subpage spinlock is not yet released. + * For releasepage() and invalidate_folio() we have a race window where + * folio_end_writeback() is called but the subpage spinlock is not yet released. * If we continue to release/invalidate the page, we could cause use-after-free * for subpage spinlock. So this function is to spin and wait for subpage * spinlock. @@ -8195,48 +8256,48 @@ static int btrfs_migratepage(struct address_space *mapping, } #endif -static void btrfs_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +static void btrfs_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *tree = &inode->io_tree; struct extent_state *cached_state = NULL; - u64 page_start = page_offset(page); - u64 page_end = page_start + PAGE_SIZE - 1; + u64 page_start = folio_pos(folio); + u64 page_end = page_start + folio_size(folio) - 1; u64 cur; int inode_evicting = inode->vfs_inode.i_state & I_FREEING; /* - * We have page locked so no new ordered extent can be created on this - * page, nor bio can be submitted for this page. + * We have folio locked so no new ordered extent can be created on this + * page, nor bio can be submitted for this folio. * - * But already submitted bio can still be finished on this page. - * Furthermore, endio function won't skip page which has Ordered + * But already submitted bio can still be finished on this folio. + * Furthermore, endio function won't skip folio which has Ordered * (Private2) already cleared, so it's possible for endio and - * invalidatepage to do the same ordered extent accounting twice - * on one page. + * invalidate_folio to do the same ordered extent accounting twice + * on one folio. * * So here we wait for any submitted bios to finish, so that we won't - * do double ordered extent accounting on the same page. + * do double ordered extent accounting on the same folio. */ - wait_on_page_writeback(page); - wait_subpage_spinlock(page); + folio_wait_writeback(folio); + wait_subpage_spinlock(&folio->page); /* * For subpage case, we have call sites like * btrfs_punch_hole_lock_range() which passes range not aligned to * sectorsize. - * If the range doesn't cover the full page, we don't need to and - * shouldn't clear page extent mapped, as page->private can still + * If the range doesn't cover the full folio, we don't need to and + * shouldn't clear page extent mapped, as folio->private can still * record subpage dirty bits for other part of the range. 
* - * For cases that can invalidate the full even the range doesn't - * cover the full page, like invalidating the last page, we're + * For cases that invalidate the full folio even the range doesn't + * cover the full folio, like invalidating the last folio, we're * still safe to wait for ordered extent to finish. */ if (!(offset == 0 && length == PAGE_SIZE)) { - btrfs_releasepage(page, GFP_NOFS); + btrfs_releasepage(&folio->page, GFP_NOFS); return; } @@ -8277,7 +8338,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, page_end); ASSERT(range_end + 1 - cur < U32_MAX); range_len = range_end + 1 - cur; - if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) { + if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) { /* * If Ordered (Private2) is cleared, it means endio has * already been executed for the range. @@ -8287,7 +8348,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, delete_states = false; goto next; } - btrfs_page_clear_ordered(fs_info, page, cur, range_len); + btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len); /* * IO on this page will never be started, so we need to account @@ -8357,11 +8418,11 @@ next: * should not have Ordered (Private2) anymore, or the above iteration * did something wrong. */ - ASSERT(!PageOrdered(page)); - btrfs_page_clear_checked(fs_info, page, page_offset(page), PAGE_SIZE); + ASSERT(!folio_test_ordered(folio)); + btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio)); if (!inode_evicting) - __btrfs_releasepage(page, GFP_NOFS); - clear_page_extent_mapped(page); + __btrfs_releasepage(&folio->page, GFP_NOFS); + clear_page_extent_mapped(&folio->page); } /* @@ -8706,7 +8767,7 @@ out: * extents beyond i_size to drop. */ if (control.extents_found > 0) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + btrfs_set_inode_full_sync(BTRFS_I(inode)); return ret; } @@ -8759,7 +8820,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) struct btrfs_inode *ei; struct inode *inode; - ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; @@ -9002,14 +9063,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; struct timespec64 ctime = current_time(old_inode); + struct btrfs_rename_ctx old_rename_ctx; + struct btrfs_rename_ctx new_rename_ctx; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); u64 old_idx = 0; u64 new_idx = 0; int ret; int ret2; - bool root_log_pinned = false; - bool dest_log_pinned = false; bool need_abort = false; /* @@ -9112,29 +9173,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode), 1); } - /* - * Now pin the logs of the roots. We do it to ensure that no other task - * can sync the logs while we are in progress with the rename, because - * that could result in an inconsistency in case any of the inodes that - * are part of this rename operation were logged before. - * - * We pin the logs even if at this precise moment none of the inodes was - * logged before. This is because right after we checked for that, some - * other task fsyncing some other inode not involved with this rename - * operation could log that one of our inodes exists. 
- * - * We don't need to pin the logs before the above calls to - * btrfs_insert_inode_ref(), since those don't ever need to change a log. - */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { - btrfs_pin_log_trans(root); - root_log_pinned = true; - } - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) { - btrfs_pin_log_trans(dest); - dest_log_pinned = true; - } - /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); @@ -9142,7 +9180,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_dentry->d_name.name, - old_dentry->d_name.len); + old_dentry->d_name.len, + &old_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9158,7 +9197,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_dentry->d_name.name, - new_dentry->d_name.len); + new_dentry->d_name.len, + &new_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); } @@ -9188,46 +9228,31 @@ static int btrfs_rename_exchange(struct inode *old_dir, if (new_inode->i_nlink == 1) BTRFS_I(new_inode)->dir_index = new_idx; - if (root_log_pinned) { - btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), - new_dentry->d_parent); - btrfs_end_log_trans(root); - root_log_pinned = false; - } - if (dest_log_pinned) { - btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir), - old_dentry->d_parent); - btrfs_end_log_trans(dest); - dest_log_pinned = false; - } -out_fail: /* - * If we have pinned a log and an error happened, we unpin tasks - * trying to sync the log and force them to fallback to a transaction - * commit if the log currently contains any of the inodes involved in - * this rename operation (to ensure we do not persist a log with an - * inconsistent state for any of these inodes or leading to any - * inconsistencies when replayed). If the transaction was aborted, the - * abortion reason is propagated to userspace when attempting to commit - * the transaction. If the log does not contain any of these inodes, we - * allow the tasks to sync it. + * Now pin the logs of the roots. We do it to ensure that no other task + * can sync the logs while we are in progress with the rename, because + * that could result in an inconsistency in case any of the inodes that + * are part of this rename operation were logged before. */ - if (ret && (root_log_pinned || dest_log_pinned)) { - if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)) - btrfs_set_log_full_commit(trans); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_pin_log_trans(root); + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_pin_log_trans(dest); - if (root_log_pinned) { - btrfs_end_log_trans(root); - root_log_pinned = false; - } - if (dest_log_pinned) { - btrfs_end_log_trans(dest); - dest_log_pinned = false; - } - } + /* Do the log updates for all inodes. 
*/ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), + old_rename_ctx.index, new_dentry->d_parent); + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir), + new_rename_ctx.index, old_dentry->d_parent); + + /* Now unpin the logs. */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_end_log_trans(root); + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_end_log_trans(dest); +out_fail: ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: @@ -9302,11 +9327,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns, struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = d_inode(new_dentry); struct inode *old_inode = d_inode(old_dentry); + struct btrfs_rename_ctx rename_ctx; u64 index = 0; int ret; int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); - bool log_pinned = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9411,29 +9436,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns, if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { - /* - * Now pin the log. We do it to ensure that no other task can - * sync the log while we are in progress with the rename, as - * that could result in an inconsistency in case any of the - * inodes that are part of this rename operation were logged - * before. - * - * We pin the log even if at this precise moment none of the - * inodes was logged before. This is because right after we - * checked for that, some other task fsyncing some other inode - * not involved with this rename operation could log that one of - * our inodes exists. - * - * We don't need to pin the logs before the above call to - * btrfs_insert_inode_ref(), since that does not need to change - * a log. - */ - btrfs_pin_log_trans(root); - log_pinned = true; ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), old_dentry->d_name.name, - old_dentry->d_name.len); + old_dentry->d_name.len, + &rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9475,12 +9482,9 @@ static int btrfs_rename(struct user_namespace *mnt_userns, if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; - if (log_pinned) { - btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), - new_dentry->d_parent); - btrfs_end_log_trans(root); - log_pinned = false; - } + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), + rename_ctx.index, new_dentry->d_parent); if (flags & RENAME_WHITEOUT) { ret = btrfs_whiteout_for_rename(trans, root, mnt_userns, @@ -9492,28 +9496,6 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } } out_fail: - /* - * If we have pinned the log and an error happened, we unpin tasks - * trying to sync the log and force them to fallback to a transaction - * commit if the log currently contains any of the inodes involved in - * this rename operation (to ensure we do not persist a log with an - * inconsistent state for any of these inodes or leading to any - * inconsistencies when replayed). If the transaction was aborted, the - * abortion reason is propagated to userspace when attempting to commit - * the transaction. If the log does not contain any of these inodes, we - * allow the tasks to sync it. 
- */ - if (ret && log_pinned) { - if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || - (new_inode && - btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) - btrfs_set_log_full_commit(trans); - - btrfs_end_log_trans(root); - log_pinned = false; - } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: @@ -9993,8 +9975,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, em = alloc_extent_map(); if (!em) { - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + btrfs_set_inode_full_sync(BTRFS_I(inode)); goto next; } @@ -10076,11 +10057,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, min_size, actual_len, alloc_hint, trans); } -static int btrfs_set_page_dirty(struct page *page) -{ - return __set_page_dirty_nobuffers(page); -} - static int btrfs_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { @@ -10182,6 +10158,747 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) } } +static int btrfs_encoded_io_compression_from_extent( + struct btrfs_fs_info *fs_info, + int compress_type) +{ + switch (compress_type) { + case BTRFS_COMPRESS_NONE: + return BTRFS_ENCODED_IO_COMPRESSION_NONE; + case BTRFS_COMPRESS_ZLIB: + return BTRFS_ENCODED_IO_COMPRESSION_ZLIB; + case BTRFS_COMPRESS_LZO: + /* + * The LZO format depends on the sector size. 64K is the maximum + * sector size that we support. + */ + if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K) + return -EINVAL; + return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + + (fs_info->sectorsize_bits - 12); + case BTRFS_COMPRESS_ZSTD: + return BTRFS_ENCODED_IO_COMPRESSION_ZSTD; + default: + return -EUCLEAN; + } +} + +static ssize_t btrfs_encoded_read_inline( + struct kiocb *iocb, + struct iov_iter *iter, u64 start, + u64 lockend, + struct extent_state **cached_state, + u64 extent_start, size_t count, + struct btrfs_ioctl_encoded_io_args *encoded, + bool *unlocked) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *item; + u64 ram_bytes; + unsigned long ptr; + void *tmp; + ssize_t ret; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), + extent_start, 0); + if (ret) { + if (ret > 0) { + /* The extent item disappeared? 
*/ + ret = -EIO; + } + goto out; + } + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + + ram_bytes = btrfs_file_extent_ram_bytes(leaf, item); + ptr = btrfs_file_extent_inline_start(item); + + encoded->len = min_t(u64, extent_start + ram_bytes, + inode->vfs_inode.i_size) - iocb->ki_pos; + ret = btrfs_encoded_io_compression_from_extent(fs_info, + btrfs_file_extent_compression(leaf, item)); + if (ret < 0) + goto out; + encoded->compression = ret; + if (encoded->compression) { + size_t inline_size; + + inline_size = btrfs_file_extent_inline_item_len(leaf, + path->slots[0]); + if (inline_size > count) { + ret = -ENOBUFS; + goto out; + } + count = inline_size; + encoded->unencoded_len = ram_bytes; + encoded->unencoded_offset = iocb->ki_pos - extent_start; + } else { + count = min_t(u64, count, encoded->len); + encoded->len = count; + encoded->unencoded_len = count; + ptr += iocb->ki_pos - extent_start; + } + + tmp = kmalloc(count, GFP_NOFS); + if (!tmp) { + ret = -ENOMEM; + goto out; + } + read_extent_buffer(leaf, tmp, ptr, count); + btrfs_release_path(path); + unlock_extent_cached(io_tree, start, lockend, cached_state); + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + *unlocked = true; + + ret = copy_to_iter(tmp, count, iter); + if (ret != count) + ret = -EFAULT; + kfree(tmp); +out: + btrfs_free_path(path); + return ret; +} + +struct btrfs_encoded_read_private { + struct btrfs_inode *inode; + u64 file_offset; + wait_queue_head_t wait; + atomic_t pending; + blk_status_t status; + bool skip_csum; +}; + +static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, + struct bio *bio, int mirror_num) +{ + struct btrfs_encoded_read_private *priv = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + blk_status_t ret; + + if (!priv->skip_csum) { + ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); + if (ret) + return ret; + } + + ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); + if (ret) { + btrfs_bio_free_csum(bbio); + return ret; + } + + atomic_inc(&priv->pending); + ret = btrfs_map_bio(fs_info, bio, mirror_num); + if (ret) { + atomic_dec(&priv->pending); + btrfs_bio_free_csum(bbio); + } + return ret; +} + +static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) +{ + const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK); + struct btrfs_encoded_read_private *priv = bbio->bio.bi_private; + struct btrfs_inode *inode = priv->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u32 sectorsize = fs_info->sectorsize; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + u64 start = priv->file_offset; + u32 bio_offset = 0; + + if (priv->skip_csum || !uptodate) + return bbio->bio.bi_status; + + bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { + unsigned int i, nr_sectors, pgoff; + + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); + pgoff = bvec->bv_offset; + for (i = 0; i < nr_sectors; i++) { + ASSERT(pgoff < PAGE_SIZE); + if (check_data_csum(&inode->vfs_inode, bbio, bio_offset, + bvec->bv_page, pgoff, start)) + return BLK_STS_IOERR; + start += sectorsize; + bio_offset += sectorsize; + pgoff += sectorsize; + } + } + return BLK_STS_OK; +} + +static void btrfs_encoded_read_endio(struct bio *bio) +{ + struct btrfs_encoded_read_private *priv = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); + blk_status_t status; + + status = btrfs_encoded_read_verify_csum(bbio); + if (status) { + /* 
+ * The memory barrier implied by the atomic_dec_return() here + * pairs with the memory barrier implied by the + * atomic_dec_return() or io_wait_event() in + * btrfs_encoded_read_regular_fill_pages() to ensure that this + * write is observed before the load of status in + * btrfs_encoded_read_regular_fill_pages(). + */ + WRITE_ONCE(priv->status, status); + } + if (!atomic_dec_return(&priv->pending)) + wake_up(&priv->wait); + btrfs_bio_free_csum(bbio); + bio_put(bio); +} + +static int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, + u64 disk_bytenr, + u64 disk_io_size, + struct page **pages) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_encoded_read_private priv = { + .inode = inode, + .file_offset = file_offset, + .pending = ATOMIC_INIT(1), + .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM), + }; + unsigned long i = 0; + u64 cur = 0; + int ret; + + init_waitqueue_head(&priv.wait); + /* + * Submit bios for the extent, splitting due to bio or stripe limits as + * necessary. + */ + while (cur < disk_io_size) { + struct extent_map *em; + struct btrfs_io_geometry geom; + struct bio *bio = NULL; + u64 remaining; + + em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur, + disk_io_size - cur); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + } else { + ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ, + disk_bytenr + cur, &geom); + free_extent_map(em); + } + if (ret) { + WRITE_ONCE(priv.status, errno_to_blk_status(ret)); + break; + } + remaining = min(geom.len, disk_io_size - cur); + while (bio || remaining) { + size_t bytes = min_t(u64, remaining, PAGE_SIZE); + + if (!bio) { + bio = btrfs_bio_alloc(BIO_MAX_VECS); + bio->bi_iter.bi_sector = + (disk_bytenr + cur) >> SECTOR_SHIFT; + bio->bi_end_io = btrfs_encoded_read_endio; + bio->bi_private = &priv; + bio->bi_opf = REQ_OP_READ; + } + + if (!bytes || + bio_add_page(bio, pages[i], bytes, 0) < bytes) { + blk_status_t status; + + status = submit_encoded_read_bio(inode, bio, 0); + if (status) { + WRITE_ONCE(priv.status, status); + bio_put(bio); + goto out; + } + bio = NULL; + continue; + } + + i++; + cur += bytes; + remaining -= bytes; + } + } + +out: + if (atomic_dec_return(&priv.pending)) + io_wait_event(priv.wait, !atomic_read(&priv.pending)); + /* See btrfs_encoded_read_endio() for ordering. 
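The pending/wait handling above (pending starts at 1, each submitted bio takes an extra reference, and both the submitter and the end_io path drop one) is a completion-counting pattern: the initial bias keeps the count from reaching zero until the submitter has finished issuing every piece, so an early completion cannot wake the waiter prematurely. A rough standalone userspace analogue of the same counting scheme, using pthreads and C11 atomics (the threading model and names are illustrative, not the kernel's):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int pending = 1;	/* submitter's bias reference */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done = PTHREAD_COND_INITIALIZER;

/* Plays the role of the end_io handler: drop one reference and wake the
 * waiter only if this was the last outstanding piece. */
static void *complete_one(void *arg)
{
	(void)arg;
	if (atomic_fetch_sub(&pending, 1) == 1) {
		pthread_mutex_lock(&lock);
		pthread_cond_signal(&done);
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t tids[4];
	int i;

	for (i = 0; i < 4; i++) {
		atomic_fetch_add(&pending, 1);	/* one reference per piece */
		pthread_create(&tids[i], NULL, complete_one, NULL);
	}

	/* Drop the bias; wait only if completions are still outstanding. */
	if (atomic_fetch_sub(&pending, 1) != 1) {
		pthread_mutex_lock(&lock);
		while (atomic_load(&pending) != 0)
			pthread_cond_wait(&done, &lock);
		pthread_mutex_unlock(&lock);
	}

	for (i = 0; i < 4; i++)
		pthread_join(tids[i], NULL);
	printf("all pieces completed\n");
	return 0;
}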
*/ + return blk_status_to_errno(READ_ONCE(priv.status)); +} + +static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, + struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state **cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, + bool *unlocked) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct extent_io_tree *io_tree = &inode->io_tree; + struct page **pages; + unsigned long nr_pages, i; + u64 cur; + size_t page_offset; + ssize_t ret; + + nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + for (i = 0; i < nr_pages; i++) { + pages[i] = alloc_page(GFP_NOFS); + if (!pages[i]) { + ret = -ENOMEM; + goto out; + } + } + + ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, + disk_io_size, pages); + if (ret) + goto out; + + unlock_extent_cached(io_tree, start, lockend, cached_state); + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + *unlocked = true; + + if (compressed) { + i = 0; + page_offset = 0; + } else { + i = (iocb->ki_pos - start) >> PAGE_SHIFT; + page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1); + } + cur = 0; + while (cur < count) { + size_t bytes = min_t(size_t, count - cur, + PAGE_SIZE - page_offset); + + if (copy_page_to_iter(pages[i], page_offset, bytes, + iter) != bytes) { + ret = -EFAULT; + goto out; + } + i++; + cur += bytes; + page_offset = 0; + } + ret = count; +out: + for (i = 0; i < nr_pages; i++) { + if (pages[i]) + __free_page(pages[i]); + } + kfree(pages); + return ret; +} + +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, + struct btrfs_ioctl_encoded_io_args *encoded) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + ssize_t ret; + size_t count = iov_iter_count(iter); + u64 start, lockend, disk_bytenr, disk_io_size; + struct extent_state *cached_state = NULL; + struct extent_map *em; + bool unlocked = false; + + file_accessed(iocb->ki_filp); + + btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + + if (iocb->ki_pos >= inode->vfs_inode.i_size) { + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + return 0; + } + start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize); + /* + * We don't know how long the extent containing iocb->ki_pos is, but if + * it's compressed we know that it won't be longer than this. + */ + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; + + for (;;) { + struct btrfs_ordered_extent *ordered; + + ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, + lockend - start + 1); + if (ret) + goto out_unlock_inode; + lock_extent_bits(io_tree, start, lockend, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, start, + lockend - start + 1); + if (!ordered) + break; + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(io_tree, start, lockend, &cached_state); + cond_resched(); + } + + em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_unlock_extent; + } + + if (em->block_start == EXTENT_MAP_INLINE) { + u64 extent_start = em->start; + + /* + * For inline extents we get everything we need out of the + * extent item. 
+ */ + free_extent_map(em); + em = NULL; + ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, + &cached_state, extent_start, + count, encoded, &unlocked); + goto out; + } + + /* + * We only want to return up to EOF even if the extent extends beyond + * that. + */ + encoded->len = min_t(u64, extent_map_end(em), + inode->vfs_inode.i_size) - iocb->ki_pos; + if (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + disk_bytenr = EXTENT_MAP_HOLE; + count = min_t(u64, count, encoded->len); + encoded->len = count; + encoded->unencoded_len = count; + } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + disk_bytenr = em->block_start; + /* + * Bail if the buffer isn't large enough to return the whole + * compressed extent. + */ + if (em->block_len > count) { + ret = -ENOBUFS; + goto out_em; + } + disk_io_size = count = em->block_len; + encoded->unencoded_len = em->ram_bytes; + encoded->unencoded_offset = iocb->ki_pos - em->orig_start; + ret = btrfs_encoded_io_compression_from_extent(fs_info, + em->compress_type); + if (ret < 0) + goto out_em; + encoded->compression = ret; + } else { + disk_bytenr = em->block_start + (start - em->start); + if (encoded->len > count) + encoded->len = count; + /* + * Don't read beyond what we locked. This also limits the page + * allocations that we'll do. + */ + disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; + count = start + disk_io_size - iocb->ki_pos; + encoded->len = count; + encoded->unencoded_len = count; + disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize); + } + free_extent_map(em); + em = NULL; + + if (disk_bytenr == EXTENT_MAP_HOLE) { + unlock_extent_cached(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + unlocked = true; + ret = iov_iter_zero(count, iter); + if (ret != count) + ret = -EFAULT; + } else { + ret = btrfs_encoded_read_regular(iocb, iter, start, lockend, + &cached_state, disk_bytenr, + disk_io_size, count, + encoded->compression, + &unlocked); + } + +out: + if (ret >= 0) + iocb->ki_pos += encoded->len; +out_em: + free_extent_map(em); +out_unlock_extent: + if (!unlocked) + unlock_extent_cached(io_tree, start, lockend, &cached_state); +out_unlock_inode: + if (!unlocked) + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + return ret; +} + +ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + struct extent_changeset *data_reserved = NULL; + struct extent_state *cached_state = NULL; + int compression; + size_t orig_count; + u64 start, end; + u64 num_bytes, ram_bytes, disk_num_bytes; + unsigned long nr_pages, i; + struct page **pages; + struct btrfs_key ins; + bool extent_reserved = false; + struct extent_map *em; + ssize_t ret; + + switch (encoded->compression) { + case BTRFS_ENCODED_IO_COMPRESSION_ZLIB: + compression = BTRFS_COMPRESS_ZLIB; + break; + case BTRFS_ENCODED_IO_COMPRESSION_ZSTD: + compression = BTRFS_COMPRESS_ZSTD; + break; + case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K: + /* The sector size must match for LZO. 
*/ + if (encoded->compression - + BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 != + fs_info->sectorsize_bits) + return -EINVAL; + compression = BTRFS_COMPRESS_LZO; + break; + default: + return -EINVAL; + } + if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE) + return -EINVAL; + + orig_count = iov_iter_count(from); + + /* The extent size must be sane. */ + if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED || + orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0) + return -EINVAL; + + /* + * The compressed data must be smaller than the decompressed data. + * + * It's of course possible for data to compress to larger or the same + * size, but the buffered I/O path falls back to no compression for such + * data, and we don't want to break any assumptions by creating these + * extents. + * + * Note that this is less strict than the current check we have that the + * compressed data must be at least one sector smaller than the + * decompressed data. We only want to enforce the weaker requirement + * from old kernels that it is at least one byte smaller. + */ + if (orig_count >= encoded->unencoded_len) + return -EINVAL; + + /* The extent must start on a sector boundary. */ + start = iocb->ki_pos; + if (!IS_ALIGNED(start, fs_info->sectorsize)) + return -EINVAL; + + /* + * The extent must end on a sector boundary. However, we allow a write + * which ends at or extends i_size to have an unaligned length; we round + * up the extent size and set i_size to the unaligned end. + */ + if (start + encoded->len < inode->vfs_inode.i_size && + !IS_ALIGNED(start + encoded->len, fs_info->sectorsize)) + return -EINVAL; + + /* Finally, the offset in the unencoded data must be sector-aligned. */ + if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize)) + return -EINVAL; + + num_bytes = ALIGN(encoded->len, fs_info->sectorsize); + ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize); + end = start + num_bytes - 1; + + /* + * If the extent cannot be inline, the compressed data on disk must be + * sector-aligned. For convenience, we extend it with zeroes if it + * isn't. 
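Both sides of the LZO handling rely on the BTRFS_ENCODED_IO_COMPRESSION_LZO_* values being consecutive: the read side computes LZO_4K + (sectorsize_bits - 12) and the write side checks the inverse, as above. A tiny standalone sketch of that arithmetic; the enum values here are placeholders, only their consecutive ordering matters:

#include <stdio.h>

enum {
	ENC_LZO_4K = 3,		/* placeholder numbering */
	ENC_LZO_8K,
	ENC_LZO_16K,
	ENC_LZO_32K,
	ENC_LZO_64K,
};

int main(void)
{
	unsigned int bits;

	/* sectorsize_bits 12..16 covers the supported 4K..64K sector sizes */
	for (bits = 12; bits <= 16; bits++)
		printf("sectorsize %6u (bits %u) -> LZO variant %d\n",
		       1u << bits, bits, ENC_LZO_4K + (int)(bits - 12));
	return 0;
}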
+ */ + disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); + nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); + pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT); + if (!pages) + return -ENOMEM; + for (i = 0; i < nr_pages; i++) { + size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); + char *kaddr; + + pages[i] = alloc_page(GFP_KERNEL_ACCOUNT); + if (!pages[i]) { + ret = -ENOMEM; + goto out_pages; + } + kaddr = kmap(pages[i]); + if (copy_from_iter(kaddr, bytes, from) != bytes) { + kunmap(pages[i]); + ret = -EFAULT; + goto out_pages; + } + if (bytes < PAGE_SIZE) + memset(kaddr + bytes, 0, PAGE_SIZE - bytes); + kunmap(pages[i]); + } + + for (;;) { + struct btrfs_ordered_extent *ordered; + + ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes); + if (ret) + goto out_pages; + ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping, + start >> PAGE_SHIFT, + end >> PAGE_SHIFT); + if (ret) + goto out_pages; + lock_extent_bits(io_tree, start, end, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); + if (!ordered && + !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end)) + break; + if (ordered) + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(io_tree, start, end, &cached_state); + cond_resched(); + } + + /* + * We don't use the higher-level delalloc space functions because our + * num_bytes and disk_num_bytes are different. + */ + ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes); + if (ret) + goto out_unlock; + ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes); + if (ret) + goto out_free_data_space; + ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes); + if (ret) + goto out_qgroup_free_data; + + /* Try an inline extent first. 
*/ + if (start == 0 && encoded->unencoded_len == encoded->len && + encoded->unencoded_offset == 0) { + ret = cow_file_range_inline(inode, encoded->len, orig_count, + compression, pages, true); + if (ret <= 0) { + if (ret == 0) + ret = orig_count; + goto out_delalloc_release; + } + } + + ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes, + disk_num_bytes, 0, 0, &ins, 1, 1); + if (ret) + goto out_delalloc_release; + extent_reserved = true; + + em = create_io_em(inode, start, num_bytes, + start - encoded->unencoded_offset, ins.objectid, + ins.offset, ins.offset, ram_bytes, compression, + BTRFS_ORDERED_COMPRESSED); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_free_reserved; + } + free_extent_map(em); + + ret = btrfs_add_ordered_extent(inode, start, num_bytes, ram_bytes, + ins.objectid, ins.offset, + encoded->unencoded_offset, + (1 << BTRFS_ORDERED_ENCODED) | + (1 << BTRFS_ORDERED_COMPRESSED), + compression); + if (ret) { + btrfs_drop_extent_cache(inode, start, end, 0); + goto out_free_reserved; + } + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + + if (start + encoded->len > inode->vfs_inode.i_size) + i_size_write(&inode->vfs_inode, start + encoded->len); + + unlock_extent_cached(io_tree, start, end, &cached_state); + + btrfs_delalloc_release_extents(inode, num_bytes); + + if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid, + ins.offset, pages, nr_pages, 0, NULL, + false)) { + btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0); + ret = -EIO; + goto out_pages; + } + ret = orig_count; + goto out; + +out_free_reserved: + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); +out_delalloc_release: + btrfs_delalloc_release_extents(inode, num_bytes); + btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); +out_qgroup_free_data: + if (ret < 0) + btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes); +out_free_data_space: + /* + * If btrfs_reserve_extent() succeeded, then we already decremented + * bytes_may_use. 
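The encoded read and write paths above are driven by a new ioctl pair; the 32-bit compat definitions show up further down in the ioctl.c hunk. A rough userspace sketch of the read side, assuming a <linux/btrfs.h> that already exports struct btrfs_ioctl_encoded_io_args and BTRFS_IOC_ENCODED_READ from this series, and glossing over the privilege checks the ioctl performs:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/uio.h>
#include <linux/btrfs.h>

static char buf[256 * 1024];	/* plenty for one (compressed) extent */

int main(int argc, char **argv)
{
	struct btrfs_ioctl_encoded_io_args args;
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	int fd, ret;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&args, 0, sizeof(args));
	args.iov = &iov;
	args.iovcnt = 1;
	args.offset = 0;	/* file position to read an extent at */
	ret = ioctl(fd, BTRFS_IOC_ENCODED_READ, &args);
	if (ret < 0) {
		perror("BTRFS_IOC_ENCODED_READ");
		return 1;
	}
	/* On success the args describe the extent as stored on disk. */
	printf("len %llu unencoded_len %llu unencoded_offset %llu compression %u\n",
	       (unsigned long long)args.len,
	       (unsigned long long)args.unencoded_len,
	       (unsigned long long)args.unencoded_offset,
	       args.compression);
	return 0;
}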
+ */ + if (!extent_reserved) + btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); +out_unlock: + unlock_extent_cached(io_tree, start, end, &cached_state); +out_pages: + for (i = 0; i < nr_pages; i++) { + if (pages[i]) + __free_page(pages[i]); + } + kvfree(pages); +out: + if (ret >= 0) + iocb->ki_pos += encoded->len; + return ret; +} + #ifdef CONFIG_SWAP /* * Add an entry indicating a block group or device which is pinned by a @@ -10638,12 +11355,12 @@ static const struct address_space_operations btrfs_aops = { .writepages = btrfs_writepages, .readahead = btrfs_readahead, .direct_IO = noop_direct_IO, - .invalidatepage = btrfs_invalidatepage, + .invalidate_folio = btrfs_invalidate_folio, .releasepage = btrfs_releasepage, #ifdef CONFIG_MIGRATION .migratepage = btrfs_migratepage, #endif - .set_page_dirty = btrfs_set_page_dirty, + .dirty_folio = filemap_dirty_folio, .error_remove_page = generic_error_remove_page, .swap_activate = btrfs_swap_activate, .swap_deactivate = btrfs_swap_deactivate, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a5bd6926f7ff..238cee5b5254 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -28,6 +28,7 @@ #include <linux/iversion.h> #include <linux/fileattr.h> #include <linux/fsverity.h> +#include <linux/sched/xacct.h> #include "ctree.h" #include "disk-io.h" #include "export.h" @@ -88,6 +89,24 @@ struct btrfs_ioctl_send_args_32 { #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \ struct btrfs_ioctl_send_args_32) + +struct btrfs_ioctl_encoded_io_args_32 { + compat_uptr_t iov; + compat_ulong_t iovcnt; + __s64 offset; + __u64 flags; + __u64 len; + __u64 unencoded_len; + __u64 unencoded_offset; + __u32 compression; + __u32 encryption; + __u8 reserved[64]; +}; + +#define BTRFS_IOC_ENCODED_READ_32 _IOR(BTRFS_IOCTL_MAGIC, 64, \ + struct btrfs_ioctl_encoded_io_args_32) +#define BTRFS_IOC_ENCODED_WRITE_32 _IOW(BTRFS_IOCTL_MAGIC, 64, \ + struct btrfs_ioctl_encoded_io_args_32) #endif /* Mask out flags that are inappropriate for the given type of inode. */ @@ -440,10 +459,8 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, } } -static int btrfs_ioctl_getversion(struct file *file, int __user *arg) +static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg) { - struct inode *inode = file_inode(file); - return put_user(inode->i_generation, arg); } @@ -753,6 +770,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct btrfs_trans_handle *trans; int ret; + /* We do not support snapshotting right now. */ + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_warn(fs_info, + "extent tree v2 doesn't support snapshotting yet"); + return -EOPNOTSUPP; + } + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return -EINVAL; @@ -805,10 +829,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, goto fail; } - spin_lock(&fs_info->trans_lock); - list_add(&pending_snapshot->list, - &trans->transaction->pending_snapshots); - spin_unlock(&fs_info->trans_lock); + trans->pending_snapshot = pending_snapshot; ret = btrfs_commit_transaction(trans); if (ret) @@ -1015,8 +1036,155 @@ out: return ret; } +/* + * Defrag specific helper to get an extent map. + * + * Differences between this and btrfs_get_extent() are: + * + * - No extent_map will be added to inode->extent_tree + * To reduce memory usage in the long run. 
+ * + * - Extra optimization to skip file extents older than @newer_than + * By using btrfs_search_forward() we can skip entire file ranges that + * have extents created in past transactions, because btrfs_search_forward() + * will not visit leaves and nodes with a generation smaller than given + * minimal generation threshold (@newer_than). + * + * Return valid em if we find a file extent matching the requirement. + * Return NULL if we can not find a file extent matching the requirement. + * + * Return ERR_PTR() for error. + */ +static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, + u64 start, u64 newer_than) +{ + struct btrfs_root *root = inode->root; + struct btrfs_file_extent_item *fi; + struct btrfs_path path = { 0 }; + struct extent_map *em; + struct btrfs_key key; + u64 ino = btrfs_ino(inode); + int ret; + + em = alloc_extent_map(); + if (!em) { + ret = -ENOMEM; + goto err; + } + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + if (newer_than) { + ret = btrfs_search_forward(root, &key, &path, newer_than); + if (ret < 0) + goto err; + /* Can't find anything newer */ + if (ret > 0) + goto not_found; + } else { + ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); + if (ret < 0) + goto err; + } + if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) { + /* + * If btrfs_search_slot() makes path to point beyond nritems, + * we should not have an empty leaf, as this inode must at + * least have its INODE_ITEM. + */ + ASSERT(btrfs_header_nritems(path.nodes[0])); + path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1; + } + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + /* Perfect match, no need to go one slot back */ + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY && + key.offset == start) + goto iterate; + + /* We didn't find a perfect match, needs to go one slot back */ + if (path.slots[0] > 0) { + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path.slots[0]--; + } + +iterate: + /* Iterate through the path to find a file extent covering @start */ + while (true) { + u64 extent_end; + + if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) + goto next; + + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + + /* + * We may go one slot back to INODE_REF/XATTR item, then + * need to go forward until we reach an EXTENT_DATA. + * But we should still has the correct ino as key.objectid. + */ + if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY) + goto next; + + /* It's beyond our target range, definitely not extent found */ + if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY) + goto not_found; + + /* + * | |<- File extent ->| + * \- start + * + * This means there is a hole between start and key.offset. + */ + if (key.offset > start) { + em->start = start; + em->orig_start = start; + em->block_start = EXTENT_MAP_HOLE; + em->len = key.offset - start; + break; + } + + fi = btrfs_item_ptr(path.nodes[0], path.slots[0], + struct btrfs_file_extent_item); + extent_end = btrfs_file_extent_end(&path); + + /* + * |<- file extent ->| | + * \- start + * + * We haven't reached start, search next slot. 
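The file extent search in defrag_get_extent() distinguishes three cases for the item found relative to @start: the item begins past @start (the gap is reported as a hole), the item ends at or before @start (move to the next item), or the item covers @start. A small standalone sketch of that classification with made-up offsets, nothing btrfs-specific:

#include <stdio.h>

/* Classify an extent item [ext_start, ext_end) against a target offset. */
static const char *classify(unsigned long long start,
			    unsigned long long ext_start,
			    unsigned long long ext_end)
{
	if (ext_start > start)
		return "hole before this extent";
	if (ext_end <= start)
		return "extent ends before start, go to next item";
	return "extent covers start";
}

int main(void)
{
	printf("%s\n", classify(4096, 8192, 16384));	/* hole */
	printf("%s\n", classify(4096, 0, 4096));	/* next item */
	printf("%s\n", classify(4096, 0, 8192));	/* covers */
	return 0;
}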
+ */ + if (extent_end <= start) + goto next; + + /* Now this extent covers @start, convert it to em */ + btrfs_extent_item_to_extent_map(inode, &path, fi, false, em); + break; +next: + ret = btrfs_next_item(root, &path); + if (ret < 0) + goto err; + if (ret > 0) + goto not_found; + } + btrfs_release_path(&path); + return em; + +not_found: + btrfs_release_path(&path); + free_extent_map(em); + return NULL; + +err: + btrfs_release_path(&path); + free_extent_map(em); + return ERR_PTR(ret); +} + static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, - bool locked) + u64 newer_than, bool locked) { struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -1031,6 +1199,20 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, em = lookup_extent_mapping(em_tree, start, sectorsize); read_unlock(&em_tree->lock); + /* + * We can get a merged extent, in that case, we need to re-search + * tree to get the original em for defrag. + * + * If @newer_than is 0 or em::generation < newer_than, we can trust + * this em, as either we don't care about the generation, or the + * merged extent map will be rejected anyway. + */ + if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) && + newer_than && em->generation >= newer_than) { + free_extent_map(em); + em = NULL; + } + if (!em) { struct extent_state *cached = NULL; u64 end = start + sectorsize - 1; @@ -1038,7 +1220,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, /* get the big lock and read metadata off disk */ if (!locked) lock_extent_bits(io_tree, start, end, &cached); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize); + em = defrag_get_extent(BTRFS_I(inode), start, newer_than); if (!locked) unlock_extent_cached(io_tree, start, end, &cached); @@ -1049,23 +1231,42 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, return em; } +static u32 get_extent_max_capacity(const struct extent_map *em) +{ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + return BTRFS_MAX_COMPRESSED; + return BTRFS_MAX_EXTENT_SIZE; +} + static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, bool locked) { struct extent_map *next; - bool ret = true; + bool ret = false; /* this is the last extent */ if (em->start + em->len >= i_size_read(inode)) return false; - next = defrag_lookup_extent(inode, em->start + em->len, locked); + /* + * We want to check if the next extent can be merged with the current + * one, which can be an extent created in a past generation, so we pass + * a minimum generation of 0 to defrag_lookup_extent(). + */ + next = defrag_lookup_extent(inode, em->start + em->len, 0, locked); + /* No more em or hole */ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) - ret = false; - else if ((em->block_start + em->block_len == next->block_start) && - (em->block_len > SZ_128K && next->block_len > SZ_128K)) - ret = false; - + goto out; + if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags)) + goto out; + /* + * If the next extent is at its max capacity, defragging current extent + * makes no sense, as the total number of extents won't change. 
+ */ + if (next->len >= get_extent_max_capacity(em)) + goto out; + ret = true; +out: free_extent_map(next); return ret; } @@ -1189,8 +1390,10 @@ struct defrag_target_range { static int defrag_collect_targets(struct btrfs_inode *inode, u64 start, u64 len, u32 extent_thresh, u64 newer_than, bool do_compress, - bool locked, struct list_head *target_list) + bool locked, struct list_head *target_list, + u64 *last_scanned_ret) { + bool last_is_target = false; u64 cur = start; int ret = 0; @@ -1200,7 +1403,9 @@ static int defrag_collect_targets(struct btrfs_inode *inode, bool next_mergeable = true; u64 range_len; - em = defrag_lookup_extent(&inode->vfs_inode, cur, locked); + last_is_target = false; + em = defrag_lookup_extent(&inode->vfs_inode, cur, + newer_than, locked); if (!em) break; @@ -1213,6 +1418,39 @@ static int defrag_collect_targets(struct btrfs_inode *inode, if (em->generation < newer_than) goto next; + /* This em is under writeback, no need to defrag */ + if (em->generation == (u64)-1) + goto next; + + /* + * Our start offset might be in the middle of an existing extent + * map, so take that into account. + */ + range_len = em->len - (cur - em->start); + /* + * If this range of the extent map is already flagged for delalloc, + * skip it, because: + * + * 1) We could deadlock later, when trying to reserve space for + * delalloc, because in case we can't immediately reserve space + * the flusher can start delalloc and wait for the respective + * ordered extents to complete. The deadlock would happen + * because we do the space reservation while holding the range + * locked, and starting writeback, or finishing an ordered + * extent, requires locking the range; + * + * 2) If there's delalloc there, it means there's dirty pages for + * which writeback has not started yet (we clean the delalloc + * flag when starting writeback and after creating an ordered + * extent). If we mark pages in an adjacent range for defrag, + * then we will have a larger contiguous range for delalloc, + * very likely resulting in a larger extent after writeback is + * triggered (except in a case of free space fragmentation). + */ + if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1, + EXTENT_DELALLOC, 0, NULL)) + goto next; + /* * For do_compress case, we want to compress all valid file * extents, thus no @extent_thresh or mergeable check. @@ -1221,7 +1459,14 @@ static int defrag_collect_targets(struct btrfs_inode *inode, goto add; /* Skip too large extent */ - if (em->len >= extent_thresh) + if (range_len >= extent_thresh) + goto next; + + /* + * Skip extents already at its max capacity, this is mostly for + * compressed extents, which max cap is only 128K. + */ + if (em->len >= get_extent_max_capacity(em)) goto next; next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, @@ -1242,6 +1487,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, } add: + last_is_target = true; range_len = min(extent_map_end(em), start + len) - cur; /* * This one is a good target, check if it can be merged into @@ -1285,10 +1531,22 @@ next: kfree(entry); } } + if (!ret && last_scanned_ret) { + /* + * If the last extent is not a target, the caller can skip to + * the end of that extent. + * Otherwise, we can only go the end of the specified range. 
+ */ + if (!last_is_target) + *last_scanned_ret = max(cur, *last_scanned_ret); + else + *last_scanned_ret = max(start + len, *last_scanned_ret); + } return ret; } #define CLUSTER_SIZE (SZ_256K) +static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); /* * Defrag one contiguous target range. @@ -1343,7 +1601,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, } static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, - u32 extent_thresh, u64 newer_than, bool do_compress) + u32 extent_thresh, u64 newer_than, bool do_compress, + u64 *last_scanned_ret) { struct extent_state *cached_state = NULL; struct defrag_target_range *entry; @@ -1389,7 +1648,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, */ ret = defrag_collect_targets(inode, start, len, extent_thresh, newer_than, do_compress, true, - &target_list); + &target_list, last_scanned_ret); if (ret < 0) goto unlock_extent; @@ -1424,7 +1683,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode, u64 start, u32 len, u32 extent_thresh, u64 newer_than, bool do_compress, unsigned long *sectors_defragged, - unsigned long max_sectors) + unsigned long max_sectors, + u64 *last_scanned_ret) { const u32 sectorsize = inode->root->fs_info->sectorsize; struct defrag_target_range *entry; @@ -1432,24 +1692,34 @@ static int defrag_one_cluster(struct btrfs_inode *inode, LIST_HEAD(target_list); int ret; - BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); ret = defrag_collect_targets(inode, start, len, extent_thresh, newer_than, do_compress, false, - &target_list); + &target_list, NULL); if (ret < 0) goto out; list_for_each_entry(entry, &target_list, list) { u32 range_len = entry->len; - /* Reached the limit */ - if (max_sectors && max_sectors == *sectors_defragged) + /* Reached or beyond the limit */ + if (max_sectors && *sectors_defragged >= max_sectors) { + ret = 1; break; + } if (max_sectors) range_len = min_t(u32, range_len, (max_sectors - *sectors_defragged) * sectorsize); + /* + * If defrag_one_range() has updated last_scanned_ret, + * our range may already be invalid (e.g. hole punched). + * Skip if our range is before last_scanned_ret, as there is + * no need to defrag the range anymore. + */ + if (entry->start + range_len <= *last_scanned_ret) + continue; + if (ra) page_cache_sync_readahead(inode->vfs_inode.i_mapping, ra, NULL, entry->start >> PAGE_SHIFT, @@ -1462,16 +1732,20 @@ static int defrag_one_cluster(struct btrfs_inode *inode, * accounting. */ ret = defrag_one_range(inode, entry->start, range_len, - extent_thresh, newer_than, do_compress); + extent_thresh, newer_than, do_compress, + last_scanned_ret); if (ret < 0) break; - *sectors_defragged += range_len; + *sectors_defragged += range_len >> + inode->root->fs_info->sectorsize_bits; } out: list_for_each_entry_safe(entry, tmp, &target_list, list) { list_del_init(&entry->list); kfree(entry); } + if (ret >= 0) + *last_scanned_ret = max(*last_scanned_ret, start + len); return ret; } @@ -1484,6 +1758,12 @@ out: * @newer_than: minimum transid to defrag * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode * will be defragged. + * + * Return <0 for error. + * Return >=0 for the number of sectors defragged, and range->start will be updated + * to indicate the file offset where next defrag should be started at. + * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without + * defragging all the range). 
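btrfs_defrag_file() is also what the BTRFS_IOC_DEFRAG_RANGE ioctl ultimately calls. A rough userspace sketch of driving it over a whole file with a small extent threshold; the struct and flag names are the long-standing defrag UAPI rather than anything new here, and the resume-from-range->start behaviour documented above matters mainly for in-kernel callers such as autodefrag:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(int argc, char **argv)
{
	struct btrfs_ioctl_defrag_range_args range;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&range, 0, sizeof(range));
	range.start = 0;
	range.len = (__u64)-1;			/* whole file */
	range.extent_thresh = 256 * 1024;	/* only consider extents smaller than 256K */
	range.flags = BTRFS_DEFRAG_RANGE_START_IO;	/* ask for writeback on the result */
	if (ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range) < 0) {
		perror("BTRFS_IOC_DEFRAG_RANGE");
		return 1;
	}
	printf("defrag requested for %s\n", argv[1]);
	return 0;
}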
*/ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, @@ -1499,6 +1779,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, int compress_type = BTRFS_COMPRESS_ZLIB; int ret = 0; u32 extent_thresh = range->extent_thresh; + pgoff_t start_index; if (isize == 0) return 0; @@ -1518,12 +1799,16 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, if (range->start + range->len > range->start) { /* Got a specific range */ - last_byte = min(isize, range->start + range->len) - 1; + last_byte = min(isize, range->start + range->len); } else { /* Defrag until file end */ - last_byte = isize - 1; + last_byte = isize; } + /* Align the range */ + cur = round_down(range->start, fs_info->sectorsize); + last_byte = round_up(last_byte, fs_info->sectorsize) - 1; + /* * If we were not given a ra, allocate a readahead context. As * readahead is just an optimization, defrag will work without it so @@ -1536,15 +1821,23 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, file_ra_state_init(ra, inode->i_mapping); } - /* Align the range */ - cur = round_down(range->start, fs_info->sectorsize); - last_byte = round_up(last_byte, fs_info->sectorsize) - 1; + /* + * Make writeback start from the beginning of the range, so that the + * defrag range can be written sequentially. + */ + start_index = cur >> PAGE_SHIFT; + if (start_index < inode->i_mapping->writeback_index) + inode->i_mapping->writeback_index = start_index; while (cur < last_byte) { + const unsigned long prev_sectors_defragged = sectors_defragged; + u64 last_scanned = cur; u64 cluster_end; - /* The cluster size 256K should always be page aligned */ - BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); + if (btrfs_defrag_cancelled(fs_info)) { + ret = -EAGAIN; + break; + } /* We want the cluster end at page boundary when possible */ cluster_end = (((cur >> PAGE_SHIFT) + @@ -1565,16 +1858,30 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, BTRFS_I(inode)->defrag_compress = compress_type; ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, cluster_end + 1 - cur, extent_thresh, - newer_than, do_compress, - §ors_defragged, max_to_defrag); + newer_than, do_compress, §ors_defragged, + max_to_defrag, &last_scanned); + + if (sectors_defragged > prev_sectors_defragged) + balance_dirty_pages_ratelimited(inode->i_mapping); + btrfs_inode_unlock(inode, 0); if (ret < 0) break; - cur = cluster_end + 1; + cur = max(cluster_end + 1, last_scanned); + if (ret > 0) { + ret = 0; + break; + } + cond_resched(); } if (ra_allocated) kfree(ra); + /* + * Update range.start for autodefrag, this will indicate where to start + * in next run. 
+ */ + range->start = cur; if (sectors_defragged) { /* * We have defragged some sectors, for compression case they @@ -1943,10 +2250,9 @@ free_args: return ret; } -static noinline int btrfs_ioctl_subvol_getflags(struct file *file, +static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode, void __user *arg) { - struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; @@ -2276,12 +2582,11 @@ err: return ret; } -static noinline int btrfs_ioctl_tree_search(struct file *file, - void __user *argp) +static noinline int btrfs_ioctl_tree_search(struct inode *inode, + void __user *argp) { struct btrfs_ioctl_search_args __user *uargs; struct btrfs_ioctl_search_key sk; - struct inode *inode; int ret; size_t buf_size; @@ -2295,7 +2600,6 @@ static noinline int btrfs_ioctl_tree_search(struct file *file, buf_size = sizeof(uargs->buf); - inode = file_inode(file); ret = search_ioctl(inode, &sk, &buf_size, uargs->buf); /* @@ -2310,12 +2614,11 @@ static noinline int btrfs_ioctl_tree_search(struct file *file, return ret; } -static noinline int btrfs_ioctl_tree_search_v2(struct file *file, +static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, void __user *argp) { struct btrfs_ioctl_search_args_v2 __user *uarg; struct btrfs_ioctl_search_args_v2 args; - struct inode *inode; int ret; size_t buf_size; const size_t buf_limit = SZ_16M; @@ -2334,7 +2637,6 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file, if (buf_size > buf_limit) buf_size = buf_limit; - inode = file_inode(file); ret = search_ioctl(inode, &args.key, &buf_size, (char __user *)(&uarg->buf[0])); if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) @@ -2585,25 +2887,22 @@ out: return ret; } -static noinline int btrfs_ioctl_ino_lookup(struct file *file, +static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root, void __user *argp) { struct btrfs_ioctl_ino_lookup_args *args; - struct inode *inode; int ret = 0; args = memdup_user(argp, sizeof(*args)); if (IS_ERR(args)) return PTR_ERR(args); - inode = file_inode(file); - /* * Unprivileged query to obtain the containing subvolume root id. The * path is reset so it's consistent with btrfs_search_path_in_tree. 
*/ if (args->treeid == 0) - args->treeid = BTRFS_I(inode)->root->root_key.objectid; + args->treeid = root->root_key.objectid; if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { args->name[0] = 0; @@ -2615,7 +2914,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, goto out; } - ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, + ret = btrfs_search_path_in_tree(root->fs_info, args->treeid, args->objectid, args->name); @@ -2671,7 +2970,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp) } /* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */ -static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) +static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp) { struct btrfs_ioctl_get_subvol_info_args *subvol_info; struct btrfs_fs_info *fs_info; @@ -2683,7 +2982,6 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) struct extent_buffer *leaf; unsigned long item_off; unsigned long item_len; - struct inode *inode; int slot; int ret = 0; @@ -2697,7 +2995,6 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) return -ENOMEM; } - inode = file_inode(file); fs_info = BTRFS_I(inode)->root->fs_info; /* Get root_item of inode's subvolume */ @@ -2791,15 +3088,14 @@ out_free: * Return ROOT_REF information of the subvolume containing this inode * except the subvolume name. */ -static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp) +static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, + void __user *argp) { struct btrfs_ioctl_get_subvol_rootref_args *rootrefs; struct btrfs_root_ref *rref; - struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *leaf; - struct inode *inode; u64 objectid; int slot; int ret; @@ -2815,15 +3111,13 @@ static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp) return PTR_ERR(rootrefs); } - inode = file_inode(file); - root = BTRFS_I(inode)->root->fs_info->tree_root; - objectid = BTRFS_I(inode)->root->root_key.objectid; - + objectid = root->root_key.objectid; key.objectid = objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = rootrefs->min_treeid; found = 0; + root = root->fs_info->tree_root; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { goto out; @@ -2903,6 +3197,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, int err = 0; bool destroy_parent = false; + /* We don't support snapshots with extent tree v2 yet. 
*/ + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, + "extent tree v2 doesn't support snapshot deletion yet"); + return -EOPNOTSUPP; + } + if (destroy_v2) { vol_args2 = memdup_user(arg, sizeof(*vol_args2)); if (IS_ERR(vol_args2)) @@ -3086,10 +3387,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, btrfs_inode_lock(inode, 0); err = btrfs_delete_subvolume(dir, dentry); btrfs_inode_unlock(inode, 0); - if (!err) { - fsnotify_rmdir(dir, dentry); - d_delete(dentry); - } + if (!err) + d_delete_notify(dir, dentry); out_dput: dput(dentry); @@ -3180,6 +3479,11 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "device add not supported on extent tree v2 yet"); + return -EINVAL; + } + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) { if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; @@ -3290,7 +3594,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) struct block_device *bdev = NULL; fmode_t mode; int ret; - bool cancel; + bool cancel = false; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3705,6 +4009,11 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet"); + return -EINVAL; + } + sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) return PTR_ERR(sa); @@ -3804,6 +4113,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "device replace not supported on extent tree v2 yet"); + return -EINVAL; + } + p = memdup_user(arg, sizeof(*p)); if (IS_ERR(p)) return PTR_ERR(p); @@ -4865,7 +5179,7 @@ out_drop_write: return ret; } -static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat) +static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat) { struct btrfs_ioctl_send_args *arg; int ret; @@ -4895,11 +5209,194 @@ static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat) if (IS_ERR(arg)) return PTR_ERR(arg); } - ret = btrfs_ioctl_send(file, arg); + ret = btrfs_ioctl_send(inode, arg); kfree(arg); return ret; } +static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, + bool compat) +{ + struct btrfs_ioctl_encoded_io_args args = { 0 }; + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, + flags); + size_t copy_end; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + loff_t pos; + struct kiocb kiocb; + ssize_t ret; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + + if (compat) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, + flags); + if (copy_from_user(&args32, argp, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + args.iov = compat_ptr(args32.iov); + args.iovcnt = args32.iovcnt; + args.offset = args32.offset; + args.flags = args32.flags; +#else + return -ENOTTY; +#endif + } else { + copy_end = copy_end_kernel; + if (copy_from_user(&args, argp, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + } + if (args.flags != 0) { + ret 
= -EINVAL; + goto out_acct; + } + + ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + &iov, &iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&iter) == 0) { + ret = 0; + goto out_iov; + } + pos = args.offset; + ret = rw_verify_area(READ, file, &pos, args.len); + if (ret < 0) + goto out_iov; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = pos; + + ret = btrfs_encoded_read(&kiocb, &iter, &args); + if (ret >= 0) { + fsnotify_access(file); + if (copy_to_user(argp + copy_end, + (char *)&args + copy_end_kernel, + sizeof(args) - copy_end_kernel)) + ret = -EFAULT; + } + +out_iov: + kfree(iov); +out_acct: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + return ret; +} + +static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat) +{ + struct btrfs_ioctl_encoded_io_args args; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + loff_t pos; + struct kiocb kiocb; + ssize_t ret; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + + if (!(file->f_mode & FMODE_WRITE)) { + ret = -EBADF; + goto out_acct; + } + + if (compat) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + if (copy_from_user(&args32, argp, sizeof(args32))) { + ret = -EFAULT; + goto out_acct; + } + args.iov = compat_ptr(args32.iov); + args.iovcnt = args32.iovcnt; + args.offset = args32.offset; + args.flags = args32.flags; + args.len = args32.len; + args.unencoded_len = args32.unencoded_len; + args.unencoded_offset = args32.unencoded_offset; + args.compression = args32.compression; + args.encryption = args32.encryption; + memcpy(args.reserved, args32.reserved, sizeof(args.reserved)); +#else + return -ENOTTY; +#endif + } else { + if (copy_from_user(&args, argp, sizeof(args))) { + ret = -EFAULT; + goto out_acct; + } + } + + ret = -EINVAL; + if (args.flags != 0) + goto out_acct; + if (memchr_inv(args.reserved, 0, sizeof(args.reserved))) + goto out_acct; + if (args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE && + args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE) + goto out_acct; + if (args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES || + args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES) + goto out_acct; + if (args.unencoded_offset > args.unencoded_len) + goto out_acct; + if (args.len > args.unencoded_len - args.unencoded_offset) + goto out_acct; + + ret = import_iovec(WRITE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + &iov, &iter); + if (ret < 0) + goto out_acct; + + file_start_write(file); + + if (iov_iter_count(&iter) == 0) { + ret = 0; + goto out_end_write; + } + pos = args.offset; + ret = rw_verify_area(WRITE, file, &pos, args.len); + if (ret < 0) + goto out_end_write; + + init_sync_kiocb(&kiocb, file); + ret = kiocb_set_rw_flags(&kiocb, 0); + if (ret) + goto out_end_write; + kiocb.ki_pos = pos; + + ret = btrfs_do_write_iter(&kiocb, &iter, &args); + if (ret > 0) + fsnotify_modify(file); + +out_end_write: + file_end_write(file); + kfree(iov); +out_acct: + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -4910,7 +5407,7 @@ long btrfs_ioctl(struct file *file, unsigned int switch (cmd) { case FS_IOC_GETVERSION: - return btrfs_ioctl_getversion(file, argp); + return btrfs_ioctl_getversion(inode, argp); case FS_IOC_GETFSLABEL: return btrfs_ioctl_get_fslabel(fs_info, argp); case FS_IOC_SETFSLABEL: @@ -4930,7 +5427,7 @@ 
long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SNAP_DESTROY_V2: return btrfs_ioctl_snap_destroy(file, argp, true); case BTRFS_IOC_SUBVOL_GETFLAGS: - return btrfs_ioctl_subvol_getflags(file, argp); + return btrfs_ioctl_subvol_getflags(inode, argp); case BTRFS_IOC_SUBVOL_SETFLAGS: return btrfs_ioctl_subvol_setflags(file, argp); case BTRFS_IOC_DEFAULT_SUBVOL: @@ -4954,11 +5451,11 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_BALANCE: return btrfs_ioctl_balance(file, NULL); case BTRFS_IOC_TREE_SEARCH: - return btrfs_ioctl_tree_search(file, argp); + return btrfs_ioctl_tree_search(inode, argp); case BTRFS_IOC_TREE_SEARCH_V2: - return btrfs_ioctl_tree_search_v2(file, argp); + return btrfs_ioctl_tree_search_v2(inode, argp); case BTRFS_IOC_INO_LOOKUP: - return btrfs_ioctl_ino_lookup(file, argp); + return btrfs_ioctl_ino_lookup(root, argp); case BTRFS_IOC_INO_PATHS: return btrfs_ioctl_ino_to_path(root, argp); case BTRFS_IOC_LOGICAL_INO: @@ -5005,10 +5502,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_set_received_subvol_32(file, argp); #endif case BTRFS_IOC_SEND: - return _btrfs_ioctl_send(file, argp, false); + return _btrfs_ioctl_send(inode, argp, false); #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) case BTRFS_IOC_SEND_32: - return _btrfs_ioctl_send(file, argp, true); + return _btrfs_ioctl_send(inode, argp, true); #endif case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(fs_info, argp); @@ -5035,15 +5532,25 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SET_FEATURES: return btrfs_ioctl_set_features(file, argp); case BTRFS_IOC_GET_SUBVOL_INFO: - return btrfs_ioctl_get_subvol_info(file, argp); + return btrfs_ioctl_get_subvol_info(inode, argp); case BTRFS_IOC_GET_SUBVOL_ROOTREF: - return btrfs_ioctl_get_subvol_rootref(file, argp); + return btrfs_ioctl_get_subvol_rootref(root, argp); case BTRFS_IOC_INO_LOOKUP_USER: return btrfs_ioctl_ino_lookup_user(file, argp); case FS_IOC_ENABLE_VERITY: return fsverity_ioctl_enable(file, (const void __user *)argp); case FS_IOC_MEASURE_VERITY: return fsverity_ioctl_measure(file, argp); + case BTRFS_IOC_ENCODED_READ: + return btrfs_ioctl_encoded_read(file, argp, false); + case BTRFS_IOC_ENCODED_WRITE: + return btrfs_ioctl_encoded_write(file, argp, false); +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_ENCODED_READ_32: + return btrfs_ioctl_encoded_read(file, argp, true); + case BTRFS_IOC_ENCODED_WRITE_32: + return btrfs_ioctl_encoded_write(file, argp, true); +#endif } return -ENOTTY; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 0fb90cbe7669..430ad36b8b08 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -55,6 +55,9 @@ * 0x1000 | SegHdr N+1| Data payload N+1 ... 
| */ +#define WORKSPACE_BUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) +#define WORKSPACE_CBUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) + struct workspace { void *mem; void *buf; /* where decompressed data goes */ @@ -83,8 +86,8 @@ struct list_head *lzo_alloc_workspace(unsigned int level) return ERR_PTR(-ENOMEM); workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); - workspace->buf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL); - workspace->cbuf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL); + workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL); + workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL); if (!workspace->mem || !workspace->buf || !workspace->cbuf) goto fail; @@ -380,6 +383,17 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) kunmap(cur_page); cur_in += LZO_LEN; + if (seg_len > WORKSPACE_CBUF_LENGTH) { + /* + * seg_len shouldn't be larger than we have allocated + * for workspace->cbuf + */ + btrfs_err(fs_info, "unexpectedly large lzo segment len %u", + seg_len); + ret = -EIO; + goto out; + } + /* Copy the compressed segment payload into workspace */ copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in); @@ -422,7 +436,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in, struct workspace *workspace = list_entry(ws, struct workspace, list); size_t in_len; size_t out_len; - size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE); + size_t max_segment_len = WORKSPACE_BUF_LENGTH; int ret = 0; char *kaddr; unsigned long bytes; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 6b51fd2ec5ac..1957b14b329a 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -143,16 +143,28 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, return ret; } -/* - * Allocate and add a new ordered_extent into the per-inode tree. +/** + * Add an ordered extent to the per-inode tree. + * + * @inode: Inode that this extent is for. + * @file_offset: Logical offset in file where the extent starts. + * @num_bytes: Logical length of extent in file. + * @ram_bytes: Full length of unencoded data. + * @disk_bytenr: Offset of extent on disk. + * @disk_num_bytes: Size of extent on disk. + * @offset: Offset into unencoded data where file data starts. + * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*). + * @compress_type: Compression algorithm used for data. * - * The tree is given a single reference on the ordered extent that was - * inserted. + * Most of these parameters correspond to &struct btrfs_file_extent_item. The + * tree is given a single reference on the ordered extent that was inserted. + * + * Return: 0 or -ENOMEM. 
*/ -static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int type, int dio, - int compress_type) +int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, + u64 disk_num_bytes, u64 offset, unsigned flags, + int compress_type) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -161,7 +173,8 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset struct btrfs_ordered_extent *entry; int ret; - if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) { + if (flags & + ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) { /* For nocow write, we can release the qgroup rsv right now */ ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); if (ret < 0) @@ -181,9 +194,11 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset return -ENOMEM; entry->file_offset = file_offset; - entry->disk_bytenr = disk_bytenr; entry->num_bytes = num_bytes; + entry->ram_bytes = ram_bytes; + entry->disk_bytenr = disk_bytenr; entry->disk_num_bytes = disk_num_bytes; + entry->offset = offset; entry->bytes_left = num_bytes; entry->inode = igrab(&inode->vfs_inode); entry->compress_type = compress_type; @@ -191,18 +206,12 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset entry->qgroup_rsv = ret; entry->physical = (u64)-1; - ASSERT(type == BTRFS_ORDERED_REGULAR || - type == BTRFS_ORDERED_NOCOW || - type == BTRFS_ORDERED_PREALLOC || - type == BTRFS_ORDERED_COMPRESSED); - set_bit(type, &entry->flags); + ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0); + entry->flags = flags; percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes, fs_info->delalloc_batch); - if (dio) - set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); - /* one ref for the tree */ refcount_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); @@ -247,41 +256,6 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset return 0; } -int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, - int type) -{ - ASSERT(type == BTRFS_ORDERED_REGULAR || - type == BTRFS_ORDERED_NOCOW || - type == BTRFS_ORDERED_PREALLOC); - return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, - num_bytes, disk_num_bytes, type, 0, - BTRFS_COMPRESS_NONE); -} - -int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int type) -{ - ASSERT(type == BTRFS_ORDERED_REGULAR || - type == BTRFS_ORDERED_NOCOW || - type == BTRFS_ORDERED_PREALLOC); - return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, - num_bytes, disk_num_bytes, type, 1, - BTRFS_COMPRESS_NONE); -} - -int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int compress_type) -{ - ASSERT(compress_type != BTRFS_COMPRESS_NONE); - return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, - num_bytes, disk_num_bytes, - BTRFS_ORDERED_COMPRESSED, 0, - compress_type); -} - /* * Add a struct btrfs_ordered_sum into the list of checksums to be inserted * when an ordered extent is finished. 
If the list covers more than one @@ -548,9 +522,15 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, spin_lock(&btrfs_inode->lock); btrfs_mod_outstanding_extents(btrfs_inode, -1); spin_unlock(&btrfs_inode->lock); - if (root != fs_info->tree_root) - btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes, - false); + if (root != fs_info->tree_root) { + u64 release; + + if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags)) + release = entry->disk_num_bytes; + else + release = entry->num_bytes; + btrfs_delalloc_release_metadata(btrfs_inode, release, false); + } percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes, fs_info->delalloc_batch); @@ -1052,42 +1032,18 @@ static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos, struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; u64 file_offset = ordered->file_offset + pos; u64 disk_bytenr = ordered->disk_bytenr + pos; - u64 num_bytes = len; - u64 disk_num_bytes = len; - int type; - unsigned long flags_masked = ordered->flags & ~(1 << BTRFS_ORDERED_DIRECT); - int compress_type = ordered->compress_type; - unsigned long weight; - int ret; - - weight = hweight_long(flags_masked); - WARN_ON_ONCE(weight > 1); - if (!weight) - type = 0; - else - type = __ffs(flags_masked); + unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS; /* - * The splitting extent is already counted and will be added again - * in btrfs_add_ordered_extent_*(). Subtract num_bytes to avoid - * double counting. + * The splitting extent is already counted and will be added again in + * btrfs_add_ordered_extent_*(). Subtract len to avoid double counting. */ - percpu_counter_add_batch(&fs_info->ordered_bytes, -num_bytes, + percpu_counter_add_batch(&fs_info->ordered_bytes, -len, fs_info->delalloc_batch); - if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags)) { - WARN_ON_ONCE(1); - ret = btrfs_add_ordered_extent_compress(BTRFS_I(inode), - file_offset, disk_bytenr, num_bytes, - disk_num_bytes, compress_type); - } else if (test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { - ret = btrfs_add_ordered_extent_dio(BTRFS_I(inode), file_offset, - disk_bytenr, num_bytes, disk_num_bytes, type); - } else { - ret = btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, - disk_bytenr, num_bytes, disk_num_bytes, type); - } - - return ret; + WARN_ON_ONCE(flags & (1 << BTRFS_ORDERED_COMPRESSED)); + return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len, + disk_bytenr, len, 0, flags, + ordered->compress_type); } int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 4194e960ff61..ecad67a2c745 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -74,8 +74,18 @@ enum { BTRFS_ORDERED_LOGGED_CSUM, /* We wait for this extent to complete in the current transaction */ BTRFS_ORDERED_PENDING, + /* BTRFS_IOC_ENCODED_WRITE */ + BTRFS_ORDERED_ENCODED, }; +/* BTRFS_ORDERED_* flags that specify the type of the extent. */ +#define BTRFS_ORDERED_TYPE_FLAGS ((1UL << BTRFS_ORDERED_REGULAR) | \ + (1UL << BTRFS_ORDERED_NOCOW) | \ + (1UL << BTRFS_ORDERED_PREALLOC) | \ + (1UL << BTRFS_ORDERED_COMPRESSED) | \ + (1UL << BTRFS_ORDERED_DIRECT) | \ + (1UL << BTRFS_ORDERED_ENCODED)) + struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; @@ -84,9 +94,11 @@ struct btrfs_ordered_extent { * These fields directly correspond to the same fields in * btrfs_file_extent_item. 
*/ - u64 disk_bytenr; u64 num_bytes; + u64 ram_bytes; + u64 disk_bytenr; u64 disk_num_bytes; + u64 offset; /* number of bytes that still need writing */ u64 bytes_left; @@ -179,14 +191,9 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, - int type); -int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int type); -int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int compress_type); + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, + u64 disk_num_bytes, u64 offset, unsigned flags, + int compress_type); void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 0775ae9f4419..dd8777872143 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -23,6 +23,7 @@ static const struct root_name_map root_map[] = { { BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" }, { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" }, { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" }, + { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" }, { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }, }; @@ -391,9 +392,9 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow) btrfs_header_owner(c), btrfs_node_ptr_generation(c, i), level - 1, &first_key); - if (IS_ERR(next)) { + if (IS_ERR(next)) continue; - } else if (!extent_buffer_uptodate(next)) { + if (!extent_buffer_uptodate(next)) { free_extent_buffer(next); continue; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 8928275823a1..1866b1f0da01 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -25,18 +25,6 @@ #include "sysfs.h" #include "tree-mod-log.h" -/* TODO XXX FIXME - * - subvol delete -> delete when ref goes to 0? delete limits also? - * - reorganize keys - * - compressed - * - sync - * - copy also limits on subvol creation - * - limit - * - caches for ulists - * - performance benchmarks - * - check all ioctl parameters - */ - /* * Helpers to access qgroup reservation * @@ -258,16 +246,19 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) return 0; } -/* must be called with qgroup_lock held */ -static int add_relation_rb(struct btrfs_fs_info *fs_info, - u64 memberid, u64 parentid) +/* + * Add relation specified by two qgroups. + * + * Must be called with qgroup_lock held. + * + * Return: 0 on success + * -ENOENT if one of the qgroups is NULL + * <0 other errors + */ +static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent) { - struct btrfs_qgroup *member; - struct btrfs_qgroup *parent; struct btrfs_qgroup_list *list; - member = find_qgroup_rb(fs_info, memberid); - parent = find_qgroup_rb(fs_info, parentid); if (!member || !parent) return -ENOENT; @@ -283,7 +274,27 @@ static int add_relation_rb(struct btrfs_fs_info *fs_info, return 0; } -/* must be called with qgroup_lock held */ +/* + * Add relation specified by two qgoup ids. + * + * Must be called with qgroup_lock held. 
+ * + * Return: 0 on success + * -ENOENT if one of the ids does not exist + * <0 other errors + */ +static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid) +{ + struct btrfs_qgroup *member; + struct btrfs_qgroup *parent; + + member = find_qgroup_rb(fs_info, memberid); + parent = find_qgroup_rb(fs_info, parentid); + + return __add_relation_rb(member, parent); +} + +/* Must be called with qgroup_lock held */ static int del_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid) { @@ -948,6 +959,12 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) */ lockdep_assert_held_write(&fs_info->subvol_sem); + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, + "qgroups are currently unsupported in extent tree v2"); + return -EINVAL; + } + mutex_lock(&fs_info->qgroup_ioctl_lock); if (fs_info->quota_root) goto out; @@ -1185,12 +1202,34 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) struct btrfs_trans_handle *trans = NULL; int ret = 0; + /* + * We need to have subvol_sem write locked, to prevent races between + * concurrent tasks trying to disable quotas, because we will unlock + * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes. + */ + lockdep_assert_held_write(&fs_info->subvol_sem); + mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) goto out; + + /* + * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to + * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs + * to lock that mutex while holding a transaction handle and the rescan + * worker needs to commit a transaction. + */ mutex_unlock(&fs_info->qgroup_ioctl_lock); /* + * Request qgroup rescan worker to complete and wait for it. This wait + * must be done before transaction start for quota disable since it may + * deadlock with transaction by the qgroup rescan worker. 
+ */ + clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + btrfs_qgroup_wait_for_completion(fs_info, false); + + /* * 1 For the root item * * We should also reserve enough items for the quota tree deletion in @@ -1205,14 +1244,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); goto out; } if (!fs_info->quota_root) goto out; - clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - btrfs_qgroup_wait_for_completion(fs_info, false); spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; fs_info->quota_root = NULL; @@ -1430,7 +1468,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, } spin_lock(&fs_info->qgroup_lock); - ret = add_relation_rb(fs_info, src, dst); + ret = __add_relation_rb(member, parent); if (ret < 0) { spin_unlock(&fs_info->qgroup_lock); goto out; @@ -3247,7 +3285,8 @@ out: static bool rescan_should_stop(struct btrfs_fs_info *fs_info) { return btrfs_fs_closing(fs_info) || - test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) || + !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); } static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) @@ -3277,11 +3316,9 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) err = PTR_ERR(trans); break; } - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { - err = -EINTR; - } else { - err = qgroup_rescan_leaf(trans, path); - } + + err = qgroup_rescan_leaf(trans, path); + if (err > 0) btrfs_commit_transaction(trans); else @@ -3295,7 +3332,7 @@ out: if (err > 0 && fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - } else if (err < 0) { + } else if (err < 0 || stopped) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -3383,6 +3420,9 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, btrfs_warn(fs_info, "qgroup rescan init failed, qgroup is not enabled"); ret = -EINVAL; + } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { + /* Quota disable is in progress */ + ret = -EBUSY; } if (ret) { diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index a3930da4eb3f..04a88bfe4fcf 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -277,7 +277,7 @@ copy_inline_extent: path->slots[0]), size); btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found); - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags); + btrfs_set_inode_full_sync(BTRFS_I(dst)); ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end); out: if (!ret && !trans) { @@ -494,7 +494,8 @@ process_slot: &clone_info, &trans); if (ret) goto out; - } else if (type == BTRFS_FILE_EXTENT_INLINE) { + } else { + ASSERT(type == BTRFS_FILE_EXTENT_INLINE); /* * Inline extents always have to start at file offset 0 * and can never be bigger then the sector size. 
We can @@ -505,8 +506,12 @@ process_slot: */ ASSERT(key.offset == 0); ASSERT(datal <= fs_info->sectorsize); - if (key.offset != 0 || datal > fs_info->sectorsize) - return -EUCLEAN; + if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) || + WARN_ON(key.offset != 0) || + WARN_ON(datal > fs_info->sectorsize)) { + ret = -EUCLEAN; + goto out; + } ret = clone_copy_inline_extent(inode, path, &new_key, drop_start, datal, size, @@ -518,17 +523,22 @@ process_slot: btrfs_release_path(path); /* - * If this is a new extent update the last_reflink_trans of both - * inodes. This is used by fsync to make sure it does not log - * multiple checksum items with overlapping ranges. For older - * extents we don't need to do it since inode logging skips the - * checksums for older extents. Also ignore holes and inline - * extents because they don't have checksums in the csum tree. + * Whenever we share an extent we update the last_reflink_trans + * of each inode to the current transaction. This is needed to + * make sure fsync does not log multiple checksum items with + * overlapping ranges (because some extent items might refer + * only to sections of the original extent). For the destination + * inode we do this regardless of the generation of the extents + * or even if they are inline extents or explicit holes, to make + * sure a full fsync does not skip them. For the source inode, + * we only need to update last_reflink_trans in case it's a new + * extent that is not a hole or an inline extent, to deal with + * the checksums problem on fsync. */ - if (extent_gen == trans->transid && disko > 0) { + if (extent_gen == trans->transid && disko > 0) BTRFS_I(src)->last_reflink_trans = trans->transid; - BTRFS_I(inode)->last_reflink_trans = trans->transid; - } + + BTRFS_I(inode)->last_reflink_trans = trans->transid; last_dest_end = ALIGN(new_key.offset + datal, fs_info->sectorsize); @@ -575,8 +585,7 @@ process_slot: * replaced file extent items. */ if (last_dest_end >= i_size_read(inode)) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + btrfs_set_inode_full_sync(BTRFS_I(inode)); ret = btrfs_replace_file_extents(BTRFS_I(inode), path, last_dest_end, destoff + len - 1, NULL, &trans); @@ -772,9 +781,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (btrfs_root_readonly(root_out)) return -EROFS; - if (file_in->f_path.mnt != file_out->f_path.mnt || - inode_in->i_sb != inode_out->i_sb) - return -EXDEV; + ASSERT(inode_in->i_sb == inode_out->i_sb); } /* Don't make the dst file partly checksummed */ diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f5465197996d..fdc2c4b411f0 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2599,9 +2599,9 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, eb = read_tree_block(fs_info, block->bytenr, block->owner, block->key.offset, block->level, NULL); - if (IS_ERR(eb)) { + if (IS_ERR(eb)) return PTR_ERR(eb); - } else if (!extent_buffer_uptodate(eb)) { + if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } @@ -2997,7 +2997,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, /* Reserve metadata for this range */ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), - clamped_len); + clamped_len, clamped_len); if (ret) goto release_page; @@ -3960,6 +3960,19 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) int rw = 0; int err = 0; + /* + * This only gets set if we had a half-deleted snapshot on mount. 
We + * cannot allow relocation to start while we're still trying to clean up + * these pending deletions. + */ + ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE); + if (ret) + return ret; + + /* We may have been woken up by close_ctree, so bail if we're closing. */ + if (btrfs_fs_closing(fs_info)) + return -EINTR; + bg = btrfs_lookup_block_group(fs_info, group_start); if (!bg) return -ENOENT; @@ -4110,9 +4123,8 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) * this function resumes merging reloc trees with corresponding fs trees. * this is important for keeping the sharing of tree blocks */ -int btrfs_recover_relocation(struct btrfs_root *root) +int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = root->fs_info; LIST_HEAD(reloc_roots); struct btrfs_key key; struct btrfs_root *fs_root; @@ -4153,7 +4165,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) key.type != BTRFS_ROOT_ITEM_KEY) break; - reloc_root = btrfs_read_tree_root(root, &key); + reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key); if (IS_ERR(reloc_root)) { err = PTR_ERR(reloc_root); goto out; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 3d68d2dcd83e..ca7426ef61c8 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -278,6 +278,21 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)); if (btrfs_root_refs(&root->root_item) == 0) { + struct btrfs_key drop_key; + + btrfs_disk_key_to_cpu(&drop_key, &root->root_item.drop_progress); + /* + * If we have a non-zero drop_progress then we know we + * made it partly through deleting this snapshot, and + * thus we need to make sure we block any balance from + * happening until this snapshot is completely dropped. 
+ */ + if (drop_key.objectid != 0 || drop_key.type != 0 || + drop_key.offset != 0) { + set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); + set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state); + } + set_bit(BTRFS_ROOT_DEAD_TREE, &root->state); btrfs_add_dead_root(root); } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2e9a322773f2..11089568b287 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3190,7 +3190,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 generation; int mirror_num; struct btrfs_key key; - u64 increment = map->stripe_len; + u64 increment; u64 offset; u64 extent_logical; u64 extent_physical; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d8ccb62aa7d2..7d1642937274 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -528,17 +528,12 @@ out: static int fs_path_copy(struct fs_path *p, struct fs_path *from) { - int ret; - p->reversed = from->reversed; fs_path_reset(p); - ret = fs_path_add_path(p, from); - - return ret; + return fs_path_add_path(p, from); } - static void fs_path_unreverse(struct fs_path *p) { char *tmp; @@ -4999,6 +4994,10 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) lock_page(page); if (!PageUptodate(page)) { unlock_page(page); + btrfs_err(fs_info, + "send: IO error at offset %llu for inode %llu root %llu", + page_offset(page), sctx->cur_ino, + sctx->send_root->root_key.objectid); put_page(page); ret = -EIO; break; @@ -7473,10 +7472,10 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root) root->root_key.objectid, root->dedupe_in_progress); } -long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) +long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) { int ret = 0; - struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root; + struct btrfs_root *send_root = BTRFS_I(inode)->root; struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_root *clone_root; struct send_ctx *sctx = NULL; diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 23bcefc84e49..08602fdd600a 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -126,7 +126,7 @@ enum { #define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1) #ifdef __KERNEL__ -long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg); +long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg); #endif #endif diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 294242c194d8..b87931a458eb 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -737,6 +737,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, u64 thresh = div_factor_fine(space_info->total_bytes, 90); u64 used; + lockdep_assert_held(&space_info->lock); + /* If we're just plain full then async reclaim just slows us down. */ if ((space_info->bytes_used + space_info->bytes_reserved + global_rsv_size) >= thresh) @@ -1061,7 +1063,6 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) trans_rsv->reserved; if (block_rsv_size < space_info->bytes_may_use) delalloc_size = space_info->bytes_may_use - block_rsv_size; - spin_unlock(&space_info->lock); /* * We don't want to include the global_rsv in our calculation, @@ -1092,6 +1093,8 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) flush = FLUSH_DELAYED_REFS_NR; } + spin_unlock(&space_info->lock); + /* * We don't want to reclaim everything, just a portion, so scale * down the to_reclaim by 1/4. 
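Illustrative aside (not part of the patch): the unfinished-drop handling above is split across three files — root-tree.c sets BTRFS_FS_UNFINISHED_DROPS when a snapshot drop was interrupted, relocation.c waits on that bit before relocation/balance may start, and transaction.c later clears it once the last such root is gone. The sketch below condenses that flag-bit flow into one place purely for illustration; it reuses identifiers from the hunks above, but the wake helper body is an assumption, since btrfs_wake_unfinished_drop() is not defined anywhere in this diff.

/*
 * Hedged sketch only, assuming the usual in-tree btrfs headers (ctree.h)
 * and that btrfs_wake_unfinished_drop() is a clear-and-wake style helper.
 */
#include <linux/sched.h>
#include <linux/wait_bit.h>
#include "ctree.h"

/* Mount time (root-tree.c): a snapshot drop was interrupted part way. */
static void mark_unfinished_drop(struct btrfs_fs_info *fs_info,
				 struct btrfs_root *root)
{
	set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
	set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
}

/* Relocation (relocation.c): block until all pending drops are finished. */
static int wait_for_unfinished_drops(struct btrfs_fs_info *fs_info)
{
	int ret;

	ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS,
			  TASK_INTERRUPTIBLE);
	if (ret)
		return ret;		/* interrupted by a signal */
	/* close_ctree() may wake us so that unmount can make progress. */
	if (btrfs_fs_closing(fs_info))
		return -EINTR;
	return 0;
}

/* Cleaner (assumed body): last unfinished drop done, release any waiters. */
static void wake_unfinished_drop(struct btrfs_fs_info *fs_info)
{
	clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
}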
If it takes us down to 0, diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 29bd8c7a7706..ef7ae20d2b77 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -736,7 +736,7 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, * Since we own the page lock, no one else could touch subpage::writers * and we are safe to do several atomic operations without spinlock. */ - if (atomic_read(&subpage->writers)) + if (atomic_read(&subpage->writers) == 0) /* No writers, locked by plain lock_page() */ return unlock_page(page); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4d947ba32da9..b228efe8ab6e 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -66,6 +66,52 @@ static struct file_system_type btrfs_root_fs_type; static int btrfs_remount(struct super_block *sb, int *flags, char *data); +#ifdef CONFIG_PRINTK + +#define STATE_STRING_PREFACE ": state " +#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) + +/* + * Characters to print to indicate error conditions or uncommon filesystem sate. + * RO is not an error. + */ +static const char fs_state_chars[] = { + [BTRFS_FS_STATE_ERROR] = 'E', + [BTRFS_FS_STATE_REMOUNTING] = 'M', + [BTRFS_FS_STATE_RO] = 0, + [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', + [BTRFS_FS_STATE_DEV_REPLACING] = 'R', + [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, + [BTRFS_FS_STATE_NO_CSUMS] = 'C', + [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L', +}; + +static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) +{ + unsigned int bit; + bool states_printed = false; + unsigned long fs_state = READ_ONCE(info->fs_state); + char *curr = buf; + + memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); + curr += sizeof(STATE_STRING_PREFACE) - 1; + + for_each_set_bit(bit, &fs_state, sizeof(fs_state)) { + WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT); + if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) { + *curr++ = fs_state_chars[bit]; + states_printed = true; + } + } + + /* If no states were printed, reset the buffer */ + if (!states_printed) + curr = buf; + + *curr++ = 0; +} +#endif + /* * Generally the error codes correspond to their respective errors, but there * are a few special cases. @@ -128,6 +174,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function { struct super_block *sb = fs_info->sb; #ifdef CONFIG_PRINTK + char statestr[STATE_STRING_BUF_LEN]; const char *errstr; #endif @@ -140,6 +187,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function #ifdef CONFIG_PRINTK errstr = btrfs_decode_error(errno); + btrfs_state_to_string(fs_info, statestr); if (fmt) { struct va_format vaf; va_list args; @@ -148,12 +196,12 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function vaf.fmt = fmt; vaf.va = &args; - pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n", - sb->s_id, function, line, errno, errstr, &vaf); + pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n", + sb->s_id, statestr, function, line, errno, errstr, &vaf); va_end(args); } else { - pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s\n", - sb->s_id, function, line, errno, errstr); + pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n", + sb->s_id, statestr, function, line, errno, errstr); } #endif @@ -240,11 +288,15 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, . 
vaf.va = &args; if (__ratelimit(ratelimit)) { - if (fs_info) - printk("%sBTRFS %s (device %s): %pV\n", lvl, type, - fs_info->sb->s_id, &vaf); - else + if (fs_info) { + char statestr[STATE_STRING_BUF_LEN]; + + btrfs_state_to_string(fs_info, statestr); + printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, + fs_info->sb->s_id, statestr, &vaf); + } else { printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); + } } va_end(args); @@ -861,6 +913,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_space_cache: case Opt_space_cache_version: + /* + * We already set FREE_SPACE_TREE above because we have + * compat_ro(FREE_SPACE_TREE) set, and we aren't going + * to allow v1 to be set for extent tree v2, simply + * ignore this setting if we're extent tree v2. + */ + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) + break; if (token == Opt_space_cache || strcmp(args[0].from, "v1") == 0) { btrfs_clear_opt(info->mount_opt, @@ -881,6 +941,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); break; case Opt_no_space_cache: + /* + * We cannot operate without the free space tree with + * extent tree v2, ignore this option. + */ + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) + break; if (btrfs_test_opt(info, SPACE_CACHE)) { btrfs_clear_and_info(info, SPACE_CACHE, "disabling disk space caching"); @@ -896,6 +962,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, "the 'inode_cache' option is deprecated and has no effect since 5.11"); break; case Opt_clear_cache: + /* + * We cannot clear the free space tree with extent tree + * v2, ignore this option. + */ + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) + break; btrfs_set_and_info(info, CLEAR_CACHE, "force clearing of disk cache"); break; @@ -2383,6 +2455,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, { struct btrfs_ioctl_vol_args *vol; struct btrfs_device *device = NULL; + dev_t devt = 0; int ret = -ENOTTY; if (!capable(CAP_SYS_ADMIN)) @@ -2402,7 +2475,12 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, mutex_unlock(&uuid_mutex); break; case BTRFS_IOC_FORGET_DEV: - ret = btrfs_forget_devices(vol->name); + if (vol->name[0] != 0) { + ret = lookup_bdev(vol->name, &devt); + if (ret) + break; + } + ret = btrfs_forget_devices(devt); break; case BTRFS_IOC_DEVICES_READY: mutex_lock(&uuid_mutex); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index beb7f72d50b8..17389a42a3ab 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -283,9 +283,11 @@ BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); -/* Remove once support for zoned allocation is feature complete */ #ifdef CONFIG_BTRFS_DEBUG +/* Remove once support for zoned allocation is feature complete */ BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); +/* Remove once support for extent tree v2 is feature complete */ +BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); @@ -314,6 +316,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(raid1c34), #ifdef CONFIG_BTRFS_DEBUG BTRFS_FEAT_ATTR_PTR(zoned), + BTRFS_FEAT_ATTR_PTR(extent_tree_v2), #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_PTR(verity), @@ -1104,6 +1107,11 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) static 
char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX]; static struct btrfs_feature_attr btrfs_feature_attrs[FEAT_MAX][NUM_FEATURE_BITS]; +static_assert(ARRAY_SIZE(btrfs_unknown_feature_names) == + ARRAY_SIZE(btrfs_feature_attrs)); +static_assert(ARRAY_SIZE(btrfs_unknown_feature_names[0]) == + ARRAY_SIZE(btrfs_feature_attrs[0])); + static const u64 supported_feature_masks[FEAT_MAX] = { [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, @@ -1272,11 +1280,6 @@ static void init_feature_attrs(void) struct btrfs_feature_attr *fa; int set, i; - BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) != - ARRAY_SIZE(btrfs_feature_attrs)); - BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) != - ARRAY_SIZE(btrfs_feature_attrs[0])); - memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs)); memset(btrfs_unknown_feature_names, 0, sizeof(btrfs_unknown_feature_names)); diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 319fed82d741..c5b3a631bf4f 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -15,6 +15,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) struct extent_map *em; struct rb_node *node; + write_lock(&em_tree->lock); while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) { node = rb_first_cached(&em_tree->map); em = rb_entry(node, struct extent_map, rb_node); @@ -32,6 +33,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) #endif free_extent_map(em); } + write_unlock(&em_tree->lock); } /* diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 03de89b45f27..b008c5110958 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -854,7 +854,37 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) static noinline void wait_for_commit(struct btrfs_transaction *commit, const enum btrfs_trans_state min_state) { - wait_event(commit->commit_wait, commit->state >= min_state); + struct btrfs_fs_info *fs_info = commit->fs_info; + u64 transid = commit->transid; + bool put = false; + + while (1) { + wait_event(commit->commit_wait, commit->state >= min_state); + if (put) + btrfs_put_transaction(commit); + + if (min_state < TRANS_STATE_COMPLETED) + break; + + /* + * A transaction isn't really completed until all of the + * previous transactions are completed, but with fsync we can + * end up with SUPER_COMMITTED transactions before a COMPLETED + * transaction. Wait for those. + */ + + spin_lock(&fs_info->trans_lock); + commit = list_first_entry_or_null(&fs_info->trans_list, + struct btrfs_transaction, + list); + if (!commit || commit->transid > transid) { + spin_unlock(&fs_info->trans_lock); + break; + } + refcount_inc(&commit->use_count); + put = true; + spin_unlock(&fs_info->trans_lock); + } } int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) @@ -1320,6 +1350,32 @@ again: } /* + * If we had a pending drop we need to see if there are any others left in our + * dead roots list, and if not clear our bit and wake any waiters. + */ +void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info) +{ + /* + * We put the drop in progress roots at the front of the list, so if the + * first entry doesn't have UNFINISHED_DROP set we can wake everybody + * up. 
+ */ + spin_lock(&fs_info->trans_lock); + if (!list_empty(&fs_info->dead_roots)) { + struct btrfs_root *root = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, + root_list); + if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) { + spin_unlock(&fs_info->trans_lock); + return; + } + } + spin_unlock(&fs_info->trans_lock); + + btrfs_wake_unfinished_drop(fs_info); +} + +/* * dead roots are old snapshots that need to be deleted. This allocates * a dirty root struct and adds it into the list of dead roots that need to * be deleted @@ -1331,7 +1387,12 @@ void btrfs_add_dead_root(struct btrfs_root *root) spin_lock(&fs_info->trans_lock); if (list_empty(&root->root_list)) { btrfs_grab_root(root); - list_add_tail(&root->root_list, &fs_info->dead_roots); + + /* We want to process the partially complete drops first. */ + if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) + list_add(&root->root_list, &fs_info->dead_roots); + else + list_add_tail(&root->root_list, &fs_info->dead_roots); } spin_unlock(&fs_info->trans_lock); } @@ -1850,6 +1911,14 @@ static void update_super_roots(struct btrfs_fs_info *fs_info) super->cache_generation = 0; if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags)) super->uuid_tree_generation = root_item->generation; + + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + root_item = &fs_info->block_group_root->root_item; + + super->block_group_root = root_item->bytenr; + super->block_group_root_generation = root_item->generation; + super->block_group_root_level = root_item->level; + } } int btrfs_transaction_in_commit(struct btrfs_fs_info *info) @@ -1981,16 +2050,24 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { /* - * We use writeback_inodes_sb here because if we used + * We use try_to_writeback_inodes_sb() here because if we used * btrfs_start_delalloc_roots we would deadlock with fs freeze. * Currently are holding the fs freeze lock, if we do an async flush * we'll do btrfs_join_transaction() and deadlock because we need to * wait for the fs freeze lock. Using the direct flushing we benefit * from already being in a transaction and our join_transaction doesn't * have to re-take the fs freeze lock. + * + * Note that try_to_writeback_inodes_sb() will only trigger writeback + * if it can read lock sb->s_umount. It will always be able to lock it, + * except when the filesystem is being unmounted or being frozen, but in + * those cases sync_filesystem() is called, which results in calling + * writeback_inodes_sb() while holding a write lock on sb->s_umount. + * Note that we don't call writeback_inodes_sb() directly, because it + * will emit a warning if sb->s_umount is not locked. */ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) - writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); + try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); return 0; } @@ -2000,6 +2077,27 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } +/* + * Add a pending snapshot associated with the given transaction handle to the + * respective handle. This must be called after the transaction commit started + * and while holding fs_info->trans_lock. + * This serves to guarantee a caller of btrfs_commit_transaction() that it can + * safely free the pending snapshot pointer in case btrfs_commit_transaction() + * returns an error. 
+ */ +static void add_pending_snapshot(struct btrfs_trans_handle *trans) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + + if (!trans->pending_snapshot) + return; + + lockdep_assert_held(&trans->fs_info->trans_lock); + ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_START); + + list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots); +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -2073,6 +2171,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (cur_trans->state >= TRANS_STATE_COMMIT_START) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; + add_pending_snapshot(trans); + spin_unlock(&fs_info->trans_lock); refcount_inc(&cur_trans->use_count); @@ -2163,6 +2263,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * COMMIT_DOING so make sure to wait for num_writers to == 1 again. */ spin_lock(&fs_info->trans_lock); + add_pending_snapshot(trans); cur_trans->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&fs_info->trans_lock); wait_event(cur_trans->writer_wait, @@ -2269,6 +2370,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) list_add_tail(&fs_info->chunk_root->dirty_list, &cur_trans->switch_commits); + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_set_root_node(&fs_info->block_group_root->root_item, + fs_info->block_group_root->node); + list_add_tail(&fs_info->block_group_root->dirty_list, + &cur_trans->switch_commits); + } + switch_commit_roots(trans); ASSERT(list_empty(&cur_trans->dirty_bgs)); @@ -2397,10 +2505,10 @@ cleanup_transaction: * because btrfs_commit_super will poke cleaner thread and it will process it a * few seconds later. */ -int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) +int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) { + struct btrfs_root *root; int ret; - struct btrfs_fs_info *fs_info = root->fs_info; spin_lock(&fs_info->trans_lock); if (list_empty(&fs_info->dead_roots)) { diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 1852ed9de7fd..970ff316069d 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -123,6 +123,8 @@ struct btrfs_trans_handle { struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *orig_rsv; + /* Set by a task that wants to create a snapshot. 
*/ + struct btrfs_pending_snapshot *pending_snapshot; refcount_t use_count; unsigned int type; /* @@ -214,7 +216,8 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid); void btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); -int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); +void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info); +int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 72e1c942197d..e56c0107eea3 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -639,8 +639,10 @@ static void block_group_err(const struct extent_buffer *eb, int slot, static int check_block_group_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { + struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_block_group_item bgi; u32 item_size = btrfs_item_size(leaf, slot); + u64 chunk_objectid; u64 flags; u64 type; @@ -663,8 +665,23 @@ static int check_block_group_item(struct extent_buffer *leaf, read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), sizeof(bgi)); - if (unlikely(btrfs_stack_block_group_chunk_objectid(&bgi) != - BTRFS_FIRST_CHUNK_TREE_OBJECTID)) { + chunk_objectid = btrfs_stack_block_group_chunk_objectid(&bgi); + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + /* + * We don't init the nr_global_roots until we load the global + * roots, so this could be 0 at mount time. If it's 0 we'll + * just assume we're fine, and later we'll check against our + * actual value. 
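+ * For example, with nr_global_roots == 4 the only valid global root ids
+ * stored in this field are 0, 1, 2 and 3; any value at or above
+ * nr_global_roots is rejected below.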
+ */ + if (unlikely(fs_info->nr_global_roots && + chunk_objectid >= fs_info->nr_global_roots)) { + block_group_err(leaf, slot, + "invalid block group global root id, have %llu, needs to be <= %llu", + chunk_objectid, + fs_info->nr_global_roots); + return -EUCLEAN; + } + } else if (unlikely(chunk_objectid != BTRFS_FIRST_CHUNK_TREE_OBJECTID)) { block_group_err(leaf, slot, "invalid block group chunk objectid, have %llu expect %llu", btrfs_stack_block_group_chunk_objectid(&bgi), @@ -965,6 +982,7 @@ static int check_dev_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_dev_item *ditem; + const u32 item_size = btrfs_item_size(leaf, slot); if (unlikely(key->objectid != BTRFS_DEV_ITEMS_OBJECTID)) { dev_item_err(leaf, slot, @@ -972,6 +990,13 @@ static int check_dev_item(struct extent_buffer *leaf, key->objectid, BTRFS_DEV_ITEMS_OBJECTID); return -EUCLEAN; } + + if (unlikely(item_size != sizeof(*ditem))) { + dev_item_err(leaf, slot, "invalid item size: has %u expect %zu", + item_size, sizeof(*ditem)); + return -EUCLEAN; + } + ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); if (unlikely(btrfs_device_id(leaf, ditem) != key->offset)) { dev_item_err(leaf, slot, @@ -1007,6 +1032,7 @@ static int check_inode_item(struct extent_buffer *leaf, struct btrfs_inode_item *iitem; u64 super_gen = btrfs_super_generation(fs_info->super_copy); u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777); + const u32 item_size = btrfs_item_size(leaf, slot); u32 mode; int ret; u32 flags; @@ -1016,6 +1042,12 @@ static int check_inode_item(struct extent_buffer *leaf, if (unlikely(ret < 0)) return ret; + if (unlikely(item_size != sizeof(*iitem))) { + generic_err(leaf, slot, "invalid item size: has %u expect %zu", + item_size, sizeof(*iitem)); + return -EUCLEAN; + } + iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item); /* Here we use super block generation + 1 to handle log tree */ @@ -1633,7 +1665,6 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) /* These trees must never be empty */ if (unlikely(owner == BTRFS_ROOT_TREE_OBJECTID || owner == BTRFS_CHUNK_TREE_OBJECTID || - owner == BTRFS_EXTENT_TREE_OBJECTID || owner == BTRFS_DEV_TREE_OBJECTID || owner == BTRFS_FS_TREE_OBJECTID || owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) { @@ -1642,12 +1673,25 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) owner); return -EUCLEAN; } + /* Unknown tree */ if (unlikely(owner == 0)) { generic_err(leaf, 0, "invalid owner, root 0 is not defined"); return -EUCLEAN; } + + /* EXTENT_TREE_V2 can have empty extent trees. 
*/ + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return 0; + + if (unlikely(owner == BTRFS_EXTENT_TREE_OBJECTID)) { + generic_err(leaf, 0, + "invalid root, root %llu must never be empty", + owner); + return -EUCLEAN; + } + return 0; } @@ -1667,6 +1711,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) */ for (slot = 0; slot < nritems; slot++) { u32 item_end_expected; + u64 item_data_end; int ret; btrfs_item_key_to_cpu(leaf, &key, slot); @@ -1681,6 +1726,8 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) return -EUCLEAN; } + item_data_end = (u64)btrfs_item_offset(leaf, slot) + + btrfs_item_size(leaf, slot); /* * Make sure the offset and ends are right, remember that the * item data starts at the end of the leaf and grows towards the @@ -1691,11 +1738,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) else item_end_expected = btrfs_item_offset(leaf, slot - 1); - if (unlikely(btrfs_item_data_end(leaf, slot) != item_end_expected)) { + if (unlikely(item_data_end != item_end_expected)) { generic_err(leaf, slot, - "unexpected item end, have %u expect %u", - btrfs_item_data_end(leaf, slot), - item_end_expected); + "unexpected item end, have %llu expect %u", + item_data_end, item_end_expected); return -EUCLEAN; } @@ -1704,12 +1750,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) * just in case all the items are consistent to each other, but * all point outside of the leaf. */ - if (unlikely(btrfs_item_data_end(leaf, slot) > - BTRFS_LEAF_DATA_SIZE(fs_info))) { + if (unlikely(item_data_end > BTRFS_LEAF_DATA_SIZE(fs_info))) { generic_err(leaf, slot, - "slot end outside of leaf, have %u expect range [0, %u]", - btrfs_item_data_end(leaf, slot), - BTRFS_LEAF_DATA_SIZE(fs_info)); + "slot end outside of leaf, have %llu expect range [0, %u]", + item_data_end, BTRFS_LEAF_DATA_SIZE(fs_info)); return -EUCLEAN; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c1ddbe800897..571dae8ad65e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -270,12 +270,6 @@ void btrfs_end_log_trans(struct btrfs_root *root) } } -static int btrfs_write_tree_block(struct extent_buffer *buf) -{ - return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start, - buf->start + buf->len - 1); -} - static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) { filemap_fdatawait_range(buf->pages[0]->mapping, @@ -294,16 +288,6 @@ struct walk_control { */ int free; - /* should we write out the extent buffer? This is used - * while flushing the log tree to disk during a sync - */ - int write; - - /* should we wait for the extent buffer io to finish? 
Also used - * while flushing the log tree to disk for a sync - */ - int wait; - /* pin only walk, we record which extents on disk belong to the * log trees */ @@ -354,17 +338,15 @@ static int process_one_buffer(struct btrfs_root *log, return ret; } - if (wc->pin) + if (wc->pin) { ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start, eb->len); + if (ret) + return ret; - if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { - if (wc->pin && btrfs_header_level(eb) == 0) + if (btrfs_buffer_uptodate(eb, gen, 0) && + btrfs_header_level(eb) == 0) ret = btrfs_exclude_logged_extents(eb); - if (wc->write) - btrfs_write_tree_block(eb); - if (wc->wait) - btrfs_wait_tree_block_writeback(eb); } return ret; } @@ -917,6 +899,26 @@ out: return ret; } +static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, + struct btrfs_inode *inode, + const char *name, + int name_len) +{ + int ret; + + ret = btrfs_unlink_inode(trans, dir, inode, name, name_len); + if (ret) + return ret; + /* + * Whenever we need to check if a name exists or not, we check the + * fs/subvolume tree. So after an unlink we must run delayed items, so + * that future checks for a name during log replay see that the name + * does not exists anymore. + */ + return btrfs_run_delayed_items(trans); +} + /* * when cleaning up conflicts between the directory names in the * subvolume, directory names in the log and directory names in the @@ -959,12 +961,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = btrfs_unlink_inode(trans, dir, BTRFS_I(inode), name, + ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name, name_len); - if (ret) - goto out; - else - ret = btrfs_run_delayed_items(trans); out: kfree(name); iput(inode); @@ -1124,14 +1122,11 @@ again: inc_nlink(&inode->vfs_inode); btrfs_release_path(path); - ret = btrfs_unlink_inode(trans, dir, inode, + ret = unlink_inode_for_log_replay(trans, dir, inode, victim_name, victim_name_len); kfree(victim_name); if (ret) return ret; - ret = btrfs_run_delayed_items(trans); - if (ret) - return ret; *search_done = 1; goto again; } @@ -1196,14 +1191,11 @@ again: inc_nlink(&inode->vfs_inode); btrfs_release_path(path); - ret = btrfs_unlink_inode(trans, + ret = unlink_inode_for_log_replay(trans, BTRFS_I(victim_parent), inode, victim_name, victim_name_len); - if (!ret) - ret = btrfs_run_delayed_items( - trans); } iput(victim_parent); kfree(victim_name); @@ -1358,7 +1350,7 @@ again: kfree(name); goto out; } - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), + ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), inode, name, namelen); kfree(name); iput(dir); @@ -1457,8 +1449,8 @@ static int add_link(struct btrfs_trans_handle *trans, ret = -ENOENT; goto out; } - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(other_inode), - name, namelen); + ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(other_inode), + name, namelen); if (ret) goto out; /* @@ -1467,10 +1459,6 @@ static int add_link(struct btrfs_trans_handle *trans, */ if (other_inode->i_nlink == 0) inc_nlink(other_inode); - - ret = btrfs_run_delayed_items(trans); - if (ret) - goto out; add_link: ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, namelen, 0, ref_index); @@ -1603,7 +1591,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ret = btrfs_inode_ref_exists(inode, dir, key->type, name, namelen); if (ret > 0) { - ret = btrfs_unlink_inode(trans, + ret = unlink_inode_for_log_replay(trans, 
BTRFS_I(dir), BTRFS_I(inode), name, namelen); @@ -2350,15 +2338,8 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, goto out; inc_nlink(inode); - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name, - name_len); - if (ret) - goto out; - - ret = btrfs_run_delayed_items(trans); - if (ret) - goto out; - + ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), + name, name_len); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset @@ -3414,6 +3395,29 @@ static void free_log_tree(struct btrfs_trans_handle *trans, if (log->node) { ret = walk_log_tree(trans, log, &wc); if (ret) { + /* + * We weren't able to traverse the entire log tree, the + * typical scenario is getting an -EIO when reading an + * extent buffer of the tree, due to a previous writeback + * failure of it. + */ + set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, + &log->fs_info->fs_state); + + /* + * Some extent buffers of the log tree may still be dirty + * and not yet written back to storage, because we may + * have updates to a log tree without syncing a log tree, + * such as during rename and link operations. So flush + * them out and wait for their writeback to complete, so + * that we properly cleanup their state and pages. + */ + btrfs_write_marked_extents(log->fs_info, + &log->dirty_log_pages, + EXTENT_DIRTY | EXTENT_NEW); + btrfs_wait_tree_log_extents(log, + EXTENT_DIRTY | EXTENT_NEW); + if (trans) btrfs_abort_transaction(trans, ret); else @@ -3454,35 +3458,156 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, } /* - * Check if an inode was logged in the current transaction. This may often - * return some false positives, because logged_trans is an in memory only field, - * not persisted anywhere. This is meant to be used in contexts where a false - * positive has no functional consequences. + * Check if an inode was logged in the current transaction. This correctly deals + * with the case where the inode was logged but has a logged_trans of 0, which + * happens if the inode is evicted and loaded again, as logged_trans is an in + * memory only field (not persisted). + * + * Returns 1 if the inode was logged before in the transaction, 0 if it was not, + * and < 0 on error. */ -static bool inode_logged(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode) +static int inode_logged(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path_in) { + struct btrfs_path *path = path_in; + struct btrfs_key key; + int ret; + if (inode->logged_trans == trans->transid) - return true; + return 1; - if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) - return false; + /* + * If logged_trans is not 0, then we know the inode logged was not logged + * in this transaction, so we can return false right away. + */ + if (inode->logged_trans > 0) + return 0; /* - * The inode's logged_trans is always 0 when we load it (because it is - * not persisted in the inode item or elsewhere). So if it is 0, the - * inode was last modified in the current transaction then the inode may - * have been logged before in the current transaction, then evicted and - * loaded again in the current transaction - or may have never been logged - * in the current transaction, but since we can not be sure, we have to - * assume it was, otherwise our callers can leave an inconsistent log. 
+ * If no log tree was created for this root in this transaction, then + * the inode can not have been logged in this transaction. In that case + * set logged_trans to anything greater than 0 and less than the current + * transaction's ID, to avoid the search below in a future call in case + * a log tree gets created after this. */ - if (inode->logged_trans == 0 && - inode->last_trans == trans->transid && - !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags)) - return true; + if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) { + inode->logged_trans = trans->transid - 1; + return 0; + } - return false; + /* + * We have a log tree and the inode's logged_trans is 0. We can't tell + * for sure if the inode was logged before in this transaction by looking + * only at logged_trans. We could be pessimistic and assume it was, but + * that can lead to unnecessarily logging an inode during rename and link + * operations, and then further updating the log in followup rename and + * link operations, specially if it's a directory, which adds latency + * visible to applications doing a series of rename or link operations. + * + * A logged_trans of 0 here can mean several things: + * + * 1) The inode was never logged since the filesystem was mounted, and may + * or may have not been evicted and loaded again; + * + * 2) The inode was logged in a previous transaction, then evicted and + * then loaded again; + * + * 3) The inode was logged in the current transaction, then evicted and + * then loaded again. + * + * For cases 1) and 2) we don't want to return true, but we need to detect + * case 3) and return true. So we do a search in the log root for the inode + * item. + */ + key.objectid = btrfs_ino(inode); + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + } + + ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); + + if (path_in) + btrfs_release_path(path); + else + btrfs_free_path(path); + + /* + * Logging an inode always results in logging its inode item. So if we + * did not find the item we know the inode was not logged for sure. + */ + if (ret < 0) { + return ret; + } else if (ret > 0) { + /* + * Set logged_trans to a value greater than 0 and less then the + * current transaction to avoid doing the search in future calls. + */ + inode->logged_trans = trans->transid - 1; + return 0; + } + + /* + * The inode was previously logged and then evicted, set logged_trans to + * the current transacion's ID, to avoid future tree searches as long as + * the inode is not evicted again. + */ + inode->logged_trans = trans->transid; + + /* + * If it's a directory, then we must set last_dir_index_offset to the + * maximum possible value, so that the next attempt to log the inode does + * not skip checking if dir index keys found in modified subvolume tree + * leaves have been logged before, otherwise it would result in attempts + * to insert duplicate dir index keys in the log tree. This must be done + * because last_dir_index_offset is an in-memory only field, not persisted + * in the inode item or any other on-disk structure, so its value is lost + * once the inode is evicted. + */ + if (S_ISDIR(inode->vfs_inode.i_mode)) + inode->last_dir_index_offset = (u64)-1; + + return 1; +} + +/* + * Delete a directory entry from the log if it exists. 
+ * + * Returns < 0 on error + * 1 if the entry does not exists + * 0 if the entry existed and was successfully deleted + */ +static int del_logged_dentry(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dir_ino, + const char *name, int name_len, + u64 index) +{ + struct btrfs_dir_item *di; + + /* + * We only log dir index items of a directory, so we don't need to look + * for dir item keys. + */ + di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, + index, name, name_len, -1); + if (IS_ERR(di)) + return PTR_ERR(di); + else if (!di) + return 1; + + /* + * We do not need to update the size field of the directory's + * inode item because on log replay we update the field to reflect + * all existing entries in the directory (see overwrite_item()). + */ + return btrfs_delete_one_dir_name(trans, log, path, di); } /* @@ -3511,15 +3636,16 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, u64 index) { - struct btrfs_root *log; - struct btrfs_dir_item *di; struct btrfs_path *path; int ret; - int err = 0; - u64 dir_ino = btrfs_ino(dir); - if (!inode_logged(trans, dir)) + ret = inode_logged(trans, dir, NULL); + if (ret == 0) return; + else if (ret < 0) { + btrfs_set_log_full_commit(trans); + return; + } ret = join_running_log_trans(root); if (ret) @@ -3527,41 +3653,18 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, mutex_lock(&dir->log_mutex); - log = root->log_root; path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; + ret = -ENOMEM; goto out_unlock; } - /* - * We only log dir index items of a directory, so we don't need to look - * for dir item keys. - */ - di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, - index, name, name_len, -1); - if (IS_ERR(di)) { - err = PTR_ERR(di); - goto fail; - } - if (di) { - ret = btrfs_delete_one_dir_name(trans, log, path, di); - if (ret) { - err = ret; - goto fail; - } - } - - /* - * We do not need to update the size field of the directory's inode item - * because on log replay we update the field to reflect all existing - * entries in the directory (see overwrite_item()). 
- */ -fail: + ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), + name, name_len, index); btrfs_free_path(path); out_unlock: mutex_unlock(&dir->log_mutex); - if (err < 0) + if (ret < 0) btrfs_set_log_full_commit(trans); btrfs_end_log_trans(root); } @@ -3576,8 +3679,13 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, u64 index; int ret; - if (!inode_logged(trans, inode)) + ret = inode_logged(trans, inode, NULL); + if (ret == 0) return; + else if (ret < 0) { + btrfs_set_log_full_commit(trans); + return; + } ret = join_running_log_trans(root); if (ret) @@ -3702,19 +3810,20 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, - struct btrfs_log_ctx *ctx) + struct btrfs_log_ctx *ctx, + u64 *last_old_dentry_offset) { struct btrfs_root *log = inode->root->log_root; struct extent_buffer *src = path->nodes[0]; const int nritems = btrfs_header_nritems(src); const u64 ino = btrfs_ino(inode); - const bool inode_logged_before = inode_logged(trans, inode); bool last_found = false; int batch_start = 0; int batch_size = 0; int i; for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_dir_item *di; struct btrfs_key key; int ret; @@ -3725,7 +3834,34 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, break; } + di = btrfs_item_ptr(src, i, struct btrfs_dir_item); ctx->last_dir_item_offset = key.offset; + + /* + * Skip ranges of items that consist only of dir item keys created + * in past transactions. However if we find a gap, we must log a + * dir index range item for that gap, so that index keys in that + * gap are deleted during log replay. + */ + if (btrfs_dir_transid(src, di) < trans->transid) { + if (key.offset > *last_old_dentry_offset + 1) { + ret = insert_dir_log_key(trans, log, dst_path, + ino, *last_old_dentry_offset + 1, + key.offset - 1); + /* + * -EEXIST should never happen because when we + * log a directory in full mode (LOG_INODE_ALL) + * we drop all BTRFS_DIR_LOG_INDEX_KEY keys from + * the log tree. + */ + ASSERT(ret != -EEXIST); + if (ret < 0) + return ret; + } + + *last_old_dentry_offset = key.offset; + continue; + } /* * We must make sure that when we log a directory entry, the * corresponding inode, after log replay, has a matching link @@ -3749,25 +3885,23 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, * resulting in -ENOTEMPTY errors. */ if (!ctx->log_new_dentries) { - struct btrfs_dir_item *di; struct btrfs_key di_key; - di = btrfs_item_ptr(src, i, struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(src, di, &di_key); - if ((btrfs_dir_transid(src, di) == trans->transid || - btrfs_dir_type(src, di) == BTRFS_FT_DIR) && - di_key.type != BTRFS_ROOT_ITEM_KEY) + if (di_key.type != BTRFS_ROOT_ITEM_KEY) ctx->log_new_dentries = true; } - if (!inode_logged_before) + if (!ctx->logged_before) goto add_to_batch; /* * If we were logged before and have logged dir items, we can skip * checking if any item with a key offset larger than the last one * we logged is in the log tree, saving time and avoiding adding - * contention on the log tree. + * contention on the log tree. We can only rely on the value of + * last_dir_index_offset when we know for sure that the inode was + * previously logged in the current transaction. 
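+ * (ctx->logged_before is set from inode_logged() early in btrfs_log_inode(),
+ * before any directory items are processed.)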
*/ if (key.offset > inode->last_dir_index_offset) goto add_to_batch; @@ -3837,7 +3971,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_root *log = root->log_root; int err = 0; int ret; - u64 first_offset = min_offset; + u64 last_old_dentry_offset = min_offset - 1; u64 last_offset = (u64)-1; u64 ino = btrfs_ino(inode); @@ -3871,10 +4005,11 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, */ if (ret == 0) { struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); if (tmp.type == BTRFS_DIR_INDEX_KEY) - first_offset = max(min_offset, tmp.offset) + 1; + last_old_dentry_offset = tmp.offset; } goto done; } @@ -3883,17 +4018,18 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); if (ret == 0) { struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); - if (tmp.type == BTRFS_DIR_INDEX_KEY) { - first_offset = tmp.offset; - ret = overwrite_item(trans, log, dst_path, - path->nodes[0], path->slots[0], - &tmp); - if (ret) { - err = ret; - goto done; - } - } + /* + * The dir index key before the first one we found that needs to + * be logged might be in a previous leaf, and there might be a + * gap between these keys, meaning that we had deletions that + * happened. So the key range item we log (key type + * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the + * previous key's offset plus 1, so that those deletes are replayed. + */ + if (tmp.type == BTRFS_DIR_INDEX_KEY) + last_old_dentry_offset = tmp.offset; } btrfs_release_path(path); @@ -3915,7 +4051,8 @@ search: * from our directory */ while (1) { - ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx); + ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, + &last_old_dentry_offset); if (ret != 0) { if (ret < 0) err = ret; @@ -3941,14 +4078,16 @@ search: goto done; } if (btrfs_header_generation(path->nodes[0]) != trans->transid) { - ctx->last_dir_item_offset = min_key.offset; - ret = overwrite_item(trans, log, dst_path, - path->nodes[0], path->slots[0], - &min_key); - if (ret) - err = ret; - else - last_offset = min_key.offset; + /* + * The next leaf was not changed in the current transaction + * and has at least one dir index key. + * We check for the next key because there might have been + * one or more deletions between the last key we logged and + * that next key. So the key range item we log (key type + * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's + * offset minus 1, so that those deletes are replayed. + */ + last_offset = min_key.offset - 1; goto done; } if (need_resched()) { @@ -3964,13 +4103,21 @@ done: if (err == 0) { *last_offset_ret = last_offset; /* - * insert the log range keys to indicate where the log - * is valid + * In case the leaf was changed in the current transaction but + * all its dir items are from a past transaction, the last item + * in the leaf is a dir item and there's no gap between that last + * dir item and the first one on the next leaf (which did not + * change in the current transaction), then we don't need to log + * a range, last_old_dentry_offset is == to last_offset. 
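+ * For example, with a last old dentry at index 100 and last_offset == 100
+ * there is no gap and nothing extra is logged, while with last_offset == 105
+ * a BTRFS_DIR_LOG_INDEX_KEY item covering the range [101, 105] is inserted
+ * so that deletions in that range are replayed.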
*/ - ret = insert_dir_log_key(trans, log, path, ino, first_offset, - last_offset); - if (ret) - err = ret; + ASSERT(last_old_dentry_offset <= last_offset); + if (last_old_dentry_offset < last_offset) { + ret = insert_dir_log_key(trans, log, path, ino, + last_old_dentry_offset + 1, + last_offset); + if (ret) + err = ret; + } } return err; } @@ -3997,22 +4144,7 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, u64 max_key; int ret; - /* - * If this is the first time we are being logged in the current - * transaction, or we were logged before but the inode was evicted and - * reloaded later, in which case its logged_trans is 0, reset the value - * of the last logged key offset. Note that we don't use the helper - * function inode_logged() here - that is because the function returns - * true after an inode eviction, assuming the worst case as it can not - * know for sure if the inode was logged before. So we can not skip key - * searches in the case the inode was evicted, because it may not have - * been logged in this transaction and may have been logged in a past - * transaction, so we need to reset the last dir index offset to (u64)-1. - */ - if (inode->logged_trans != trans->transid) - inode->last_dir_index_offset = (u64)-1; - - min_key = 0; + min_key = BTRFS_DIR_START_INDEX; max_key = 0; ctx->last_dir_item_offset = inode->last_dir_index_offset; @@ -4048,9 +4180,6 @@ static int drop_inode_items(struct btrfs_trans_handle *trans, struct btrfs_key found_key; int start_slot; - if (!inode_logged(trans, inode)) - return 0; - key.objectid = btrfs_ino(inode); key.type = max_key_type; key.offset = (u64)-1; @@ -4270,23 +4399,18 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, int start_slot, int nr, int inode_only, u64 logged_isize) { - struct btrfs_fs_info *fs_info = trans->fs_info; - unsigned long src_offset; - unsigned long dst_offset; struct btrfs_root *log = inode->root->log_root; struct btrfs_file_extent_item *extent; - struct btrfs_inode_item *inode_item; struct extent_buffer *src = src_path->nodes[0]; - int ret; + int ret = 0; struct btrfs_key *ins_keys; u32 *ins_sizes; struct btrfs_item_batch batch; char *ins_data; int i; - struct list_head ordered_sums; - int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; - - INIT_LIST_HEAD(&ordered_sums); + int dst_index; + const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); + const u64 i_size = i_size_read(&inode->vfs_inode); ins_data = kmalloc(nr * sizeof(struct btrfs_key) + nr * sizeof(u32), GFP_NOFS); @@ -4298,28 +4422,152 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, batch.keys = ins_keys; batch.data_sizes = ins_sizes; batch.total_data_size = 0; - batch.nr = nr; + batch.nr = 0; + dst_index = 0; for (i = 0; i < nr; i++) { - ins_sizes[i] = btrfs_item_size(src, i + start_slot); - batch.total_data_size += ins_sizes[i]; - btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); + const int src_slot = start_slot + i; + struct btrfs_root *csum_root; + struct btrfs_ordered_sum *sums; + struct btrfs_ordered_sum *sums_next; + LIST_HEAD(ordered_sums); + u64 disk_bytenr; + u64 disk_num_bytes; + u64 extent_offset; + u64 extent_num_bytes; + bool is_old_extent; + + btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot); + + if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY) + goto add_to_batch; + + extent = btrfs_item_ptr(src, src_slot, + struct btrfs_file_extent_item); + + is_old_extent = (btrfs_file_extent_generation(src, extent) < + trans->transid); + + /* + * Don't copy 
extents from past generations. That would make us + * log a lot more metadata for common cases like doing only a + * few random writes into a file and then fsync it for the first + * time or after the full sync flag is set on the inode. We can + * get leaves full of extent items, most of which are from past + * generations, so we can skip them - as long as the inode has + * not been the target of a reflink operation in this transaction, + * as in that case it might have had file extent items with old + * generations copied into it. We also must always log prealloc + * extents that start at or beyond eof, otherwise we would lose + * them on log replay. + */ + if (is_old_extent && + ins_keys[dst_index].offset < i_size && + inode->last_reflink_trans < trans->transid) + continue; + + if (skip_csum) + goto add_to_batch; + + /* Only regular extents have checksums. */ + if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG) + goto add_to_batch; + + /* + * If it's an extent created in a past transaction, then its + * checksums are already accessible from the committed csum tree, + * no need to log them. + */ + if (is_old_extent) + goto add_to_batch; + + disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent); + /* If it's an explicit hole, there are no checksums. */ + if (disk_bytenr == 0) + goto add_to_batch; + + disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent); + + if (btrfs_file_extent_compression(src, extent)) { + extent_offset = 0; + extent_num_bytes = disk_num_bytes; + } else { + extent_offset = btrfs_file_extent_offset(src, extent); + extent_num_bytes = btrfs_file_extent_num_bytes(src, extent); + } + + csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr); + disk_bytenr += extent_offset; + ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, + disk_bytenr + extent_num_bytes - 1, + &ordered_sums, 0); + if (ret) + goto out; + + list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) { + if (!ret) + ret = log_csums(trans, inode, log, sums); + list_del(&sums->list); + kfree(sums); + } + if (ret) + goto out; + +add_to_batch: + ins_sizes[dst_index] = btrfs_item_size(src, src_slot); + batch.total_data_size += ins_sizes[dst_index]; + batch.nr++; + dst_index++; } + + /* + * We have a leaf full of old extent items that don't need to be logged, + * so we don't need to do anything. + */ + if (batch.nr == 0) + goto out; + ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); - if (ret) { - kfree(ins_data); - return ret; - } + if (ret) + goto out; + + dst_index = 0; + for (i = 0; i < nr; i++) { + const int src_slot = start_slot + i; + const int dst_slot = dst_path->slots[0] + dst_index; + struct btrfs_key key; + unsigned long src_offset; + unsigned long dst_offset; + + /* + * We're done, all the remaining items in the source leaf + * correspond to old file extent items. + */ + if (dst_index >= batch.nr) + break; + + btrfs_item_key_to_cpu(src, &key, src_slot); - for (i = 0; i < nr; i++, dst_path->slots[0]++) { - dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], - dst_path->slots[0]); + if (key.type != BTRFS_EXTENT_DATA_KEY) + goto copy_item; - src_offset = btrfs_item_ptr_offset(src, start_slot + i); + extent = btrfs_item_ptr(src, src_slot, + struct btrfs_file_extent_item); - if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { - inode_item = btrfs_item_ptr(dst_path->nodes[0], - dst_path->slots[0], + /* See the comment in the previous loop, same logic. 
*/ + if (btrfs_file_extent_generation(src, extent) < trans->transid && + key.offset < i_size && + inode->last_reflink_trans < trans->transid) + continue; + +copy_item: + dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot); + src_offset = btrfs_item_ptr_offset(src, src_slot); + + if (key.type == BTRFS_INODE_ITEM_KEY) { + struct btrfs_inode_item *inode_item; + + inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot, struct btrfs_inode_item); fill_inode_item(trans, dst_path->nodes[0], inode_item, &inode->vfs_inode, @@ -4327,71 +4575,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, logged_isize); } else { copy_extent_buffer(dst_path->nodes[0], src, dst_offset, - src_offset, ins_sizes[i]); + src_offset, ins_sizes[dst_index]); } - /* take a reference on file data extents so that truncates - * or deletes of this inode don't have to relog the inode - * again - */ - if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && - !skip_csum) { - int found_type; - extent = btrfs_item_ptr(src, start_slot + i, - struct btrfs_file_extent_item); - - if (btrfs_file_extent_generation(src, extent) < trans->transid) - continue; - - found_type = btrfs_file_extent_type(src, extent); - if (found_type == BTRFS_FILE_EXTENT_REG) { - struct btrfs_root *csum_root; - u64 ds, dl, cs, cl; - ds = btrfs_file_extent_disk_bytenr(src, - extent); - /* ds == 0 is a hole */ - if (ds == 0) - continue; - - dl = btrfs_file_extent_disk_num_bytes(src, - extent); - cs = btrfs_file_extent_offset(src, extent); - cl = btrfs_file_extent_num_bytes(src, - extent); - if (btrfs_file_extent_compression(src, - extent)) { - cs = 0; - cl = dl; - } - - csum_root = btrfs_csum_root(fs_info, ds); - ret = btrfs_lookup_csums_range(csum_root, - ds + cs, ds + cs + cl - 1, - &ordered_sums, 0); - if (ret) - break; - } - } + dst_index++; } btrfs_mark_buffer_dirty(dst_path->nodes[0]); btrfs_release_path(dst_path); +out: kfree(ins_data); - /* - * we have to do this after the loop above to avoid changing the - * log tree while trying to change the log tree. 
- */ - while (!list_empty(&ordered_sums)) { - struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, - struct btrfs_ordered_sum, - list); - if (!ret) - ret = log_csums(trans, inode, log, sums); - list_del(&sums->list); - kfree(sums); - } - return ret; } @@ -4527,14 +4721,34 @@ static int log_one_extent(struct btrfs_trans_handle *trans, { struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *log = inode->root->log_root; - struct btrfs_file_extent_item *fi; + struct btrfs_file_extent_item fi = { 0 }; struct extent_buffer *leaf; - struct btrfs_map_token token; struct btrfs_key key; u64 extent_offset = em->start - em->orig_start; u64 block_len; int ret; + btrfs_set_stack_file_extent_generation(&fi, trans->transid); + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC); + else + btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); + + block_len = max(em->block_len, em->orig_block_len); + if (em->compress_type != BTRFS_COMPRESS_NONE) { + btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start); + btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); + } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { + btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start - + extent_offset); + btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); + } + + btrfs_set_stack_file_extent_offset(&fi, extent_offset); + btrfs_set_stack_file_extent_num_bytes(&fi, em->len); + btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes); + btrfs_set_stack_file_extent_compression(&fi, em->compress_type); + ret = log_extent_csums(trans, inode, log, em, ctx); if (ret) return ret; @@ -4548,12 +4762,12 @@ static int log_one_extent(struct btrfs_trans_handle *trans, * are small, with a root at level 2 or 3 at most, due to their short * life span. 
*/ - if (inode_logged(trans, inode)) { + if (ctx->logged_before) { drop_args.path = path; drop_args.start = em->start; drop_args.end = em->start + em->len; drop_args.replace_extent = true; - drop_args.extent_item_size = sizeof(*fi); + drop_args.extent_item_size = sizeof(fi); ret = btrfs_drop_extents(trans, log, inode, &drop_args); if (ret) return ret; @@ -4565,44 +4779,14 @@ static int log_one_extent(struct btrfs_trans_handle *trans, key.offset = em->start; ret = btrfs_insert_empty_item(trans, log, path, &key, - sizeof(*fi)); + sizeof(fi)); if (ret) return ret; } leaf = path->nodes[0]; - btrfs_init_map_token(&token, leaf); - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - btrfs_set_token_file_extent_generation(&token, fi, trans->transid); - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - btrfs_set_token_file_extent_type(&token, fi, - BTRFS_FILE_EXTENT_PREALLOC); - else - btrfs_set_token_file_extent_type(&token, fi, - BTRFS_FILE_EXTENT_REG); - - block_len = max(em->block_len, em->orig_block_len); - if (em->compress_type != BTRFS_COMPRESS_NONE) { - btrfs_set_token_file_extent_disk_bytenr(&token, fi, - em->block_start); - btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); - } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { - btrfs_set_token_file_extent_disk_bytenr(&token, fi, - em->block_start - - extent_offset); - btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); - } else { - btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0); - btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0); - } - - btrfs_set_token_file_extent_offset(&token, fi, extent_offset); - btrfs_set_token_file_extent_num_bytes(&token, fi, em->len); - btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes); - btrfs_set_token_file_extent_compression(&token, fi, em->compress_type); - btrfs_set_token_file_extent_encryption(&token, fi, 0); - btrfs_set_token_file_extent_other_encoding(&token, fi, 0); + write_extent_buffer(leaf, &fi, + btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(fi)); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); @@ -4612,7 +4796,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, /* * Log all prealloc extents beyond the inode's i_size to make sure we do not - * lose them after doing a fast fsync and replaying the log. We scan the + * lose them after doing a full/fast fsync and replaying the log. We scan the * subvolume's root instead of iterating the inode's extent map tree because * otherwise we can log incorrect extent items based on extent map conversion. 
* That can happen due to the fact that extent maps are merged when they @@ -4816,7 +5000,6 @@ process: WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); - btrfs_release_path(path); if (!ret) ret = btrfs_log_prealloc_extents(trans, inode, path); if (ret) @@ -5391,6 +5574,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx, bool *need_log_inode_item) { + const u64 i_size = i_size_read(&inode->vfs_inode); struct btrfs_root *root = inode->root; int ins_start_slot = 0; int ins_nr = 0; @@ -5411,13 +5595,21 @@ again: if (min_key->type > max_key->type) break; - if (min_key->type == BTRFS_INODE_ITEM_KEY) + if (min_key->type == BTRFS_INODE_ITEM_KEY) { *need_log_inode_item = false; - - if ((min_key->type == BTRFS_INODE_REF_KEY || - min_key->type == BTRFS_INODE_EXTREF_KEY) && - inode->generation == trans->transid && - !recursive_logging) { + } else if (min_key->type == BTRFS_EXTENT_DATA_KEY && + min_key->offset >= i_size) { + /* + * Extents at and beyond eof are logged with + * btrfs_log_prealloc_extents(). + * Only regular files have BTRFS_EXTENT_DATA_KEY keys, + * and no keys greater than that, so bail out. + */ + break; + } else if ((min_key->type == BTRFS_INODE_REF_KEY || + min_key->type == BTRFS_INODE_EXTREF_KEY) && + inode->generation == trans->transid && + !recursive_logging) { u64 other_ino = 0; u64 other_parent = 0; @@ -5448,10 +5640,8 @@ again: btrfs_release_path(path); goto next_key; } - } - - /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ - if (min_key->type == BTRFS_XATTR_ITEM_KEY) { + } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { + /* Skip xattrs, logged later with btrfs_log_all_xattrs() */ if (ins_nr == 0) goto next_slot; ret = copy_items(trans, inode, dst_path, path, @@ -5503,10 +5693,29 @@ next_key: } else { break; } + + /* + * We may process many leaves full of items for our inode, so + * avoid monopolizing a cpu for too long by rescheduling while + * not holding locks on any tree. + */ + cond_resched(); } - if (ins_nr) + if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, inode_only, logged_isize); + if (ret) + return ret; + } + + if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) { + /* + * Release the path because otherwise we might attempt to double + * lock the same leaf with btrfs_log_prealloc_extents() below. + */ + btrfs_release_path(path); + ret = btrfs_log_prealloc_extents(trans, inode, dst_path); + } return ret; } @@ -5535,8 +5744,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key min_key; struct btrfs_key max_key; struct btrfs_root *log = inode->root->log_root; - int err = 0; - int ret = 0; + int ret; bool fast_search = false; u64 ino = btrfs_ino(inode); struct extent_map_tree *em_tree = &inode->extent_tree; @@ -5545,6 +5753,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, bool xattrs_logged = false; bool recursive_logging = false; bool inode_item_dropped = true; + const bool orig_logged_before = ctx->logged_before; path = btrfs_alloc_path(); if (!path) @@ -5578,8 +5787,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * and figure out which index ranges have to be logged. 
*/ if (S_ISDIR(inode->vfs_inode.i_mode)) { - err = btrfs_commit_inode_delayed_items(trans, inode); - if (err) + ret = btrfs_commit_inode_delayed_items(trans, inode); + if (ret) goto out; } @@ -5595,6 +5804,17 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } /* + * Before logging the inode item, cache the value returned by + * inode_logged(), because after that we have the need to figure out if + * the inode was previously logged in this transaction. + */ + ret = inode_logged(trans, inode, path); + if (ret < 0) + goto out_unlock; + ctx->logged_before = (ret == 1); + ret = 0; + + /* * This is for cases where logging a directory could result in losing a * a file after replaying the log. For example, if we move a file from a * directory A to a directory B, then fsync directory A, we have no way @@ -5605,7 +5825,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, inode_only == LOG_INODE_ALL && inode->last_unlink_trans >= trans->transid) { btrfs_set_log_full_commit(trans); - err = 1; + ret = 1; goto out_unlock; } @@ -5619,9 +5839,11 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); if (inode_only == LOG_INODE_EXISTS) max_key_type = BTRFS_XATTR_ITEM_KEY; - ret = drop_inode_items(trans, log, path, inode, max_key_type); + if (ctx->logged_before) + ret = drop_inode_items(trans, log, path, inode, + max_key_type); } else { - if (inode_only == LOG_INODE_EXISTS && inode_logged(trans, inode)) { + if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) { /* * Make sure the new inode item we write to the log has * the same isize as the current one (if it exists). @@ -5635,22 +5857,23 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * (zeroes), as if an expanding truncate happened, * instead of getting a file of 4Kb only. 
*/ - err = logged_inode_size(log, inode, path, &logged_isize); - if (err) + ret = logged_inode_size(log, inode, path, &logged_isize); + if (ret) goto out_unlock; } if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) { if (inode_only == LOG_INODE_EXISTS) { max_key.type = BTRFS_XATTR_ITEM_KEY; - ret = drop_inode_items(trans, log, path, inode, - max_key.type); + if (ctx->logged_before) + ret = drop_inode_items(trans, log, path, + inode, max_key.type); } else { clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); - if (inode_logged(trans, inode)) + if (ctx->logged_before) ret = truncate_inode_items(trans, log, inode, 0, 0); } @@ -5660,8 +5883,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (inode_only == LOG_INODE_ALL) fast_search = true; max_key.type = BTRFS_XATTR_ITEM_KEY; - ret = drop_inode_items(trans, log, path, inode, - max_key.type); + if (ctx->logged_before) + ret = drop_inode_items(trans, log, path, inode, + max_key.type); } else { if (inode_only == LOG_INODE_ALL) fast_search = true; @@ -5670,37 +5894,35 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } } - if (ret) { - err = ret; + if (ret) goto out_unlock; - } - err = copy_inode_items_to_log(trans, inode, &min_key, &max_key, + ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key, path, dst_path, logged_isize, recursive_logging, inode_only, ctx, &need_log_inode_item); - if (err) + if (ret) goto out_unlock; btrfs_release_path(path); btrfs_release_path(dst_path); - err = btrfs_log_all_xattrs(trans, inode, path, dst_path); - if (err) + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); + if (ret) goto out_unlock; xattrs_logged = true; if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { btrfs_release_path(path); btrfs_release_path(dst_path); - err = btrfs_log_holes(trans, inode, path); - if (err) + ret = btrfs_log_holes(trans, inode, path); + if (ret) goto out_unlock; } log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); if (need_log_inode_item) { - err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); - if (err) + ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); + if (ret) goto out_unlock; /* * If we are doing a fast fsync and the inode was logged before @@ -5711,18 +5933,16 @@ log_extents: * BTRFS_INODE_COPY_EVERYTHING set. */ if (!xattrs_logged && inode->logged_trans < trans->transid) { - err = btrfs_log_all_xattrs(trans, inode, path, dst_path); - if (err) + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); + if (ret) goto out_unlock; btrfs_release_path(path); } } if (fast_search) { ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx); - if (ret) { - err = ret; + if (ret) goto out_unlock; - } } else if (inode_only == LOG_INODE_ALL) { struct extent_map *em, *n; @@ -5734,10 +5954,8 @@ log_extents: if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { ret = log_directory_changes(trans, inode, path, dst_path, ctx); - if (ret) { - err = ret; + if (ret) goto out_unlock; - } } spin_lock(&inode->lock); @@ -5776,12 +5994,24 @@ log_extents: if (inode_only != LOG_INODE_EXISTS) inode->last_log_commit = inode->last_sub_trans; spin_unlock(&inode->lock); + + /* + * Reset the last_reflink_trans so that the next fsync does not need to + * go through the slower path when logging extents and their checksums. 
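+ * After logging the inode in full, the log is up to date with respect to
+ * extents brought in by reflinks, so later fsyncs in this transaction can
+ * again skip copying extents from past generations (see copy_items()).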
+ */ + if (inode_only == LOG_INODE_ALL) + inode->last_reflink_trans = 0; + out_unlock: mutex_unlock(&inode->log_mutex); out: btrfs_free_path(path); btrfs_free_path(dst_path); - return err; + + if (recursive_logging) + ctx->logged_before = orig_logged_before; + + return ret; } /* @@ -5866,7 +6096,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_root *log = root->log_root; struct btrfs_path *path; LIST_HEAD(dir_list); struct btrfs_dir_list *dir_elem; @@ -5908,7 +6137,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, min_key.offset = 0; again: btrfs_release_path(path); - ret = btrfs_search_forward(log, &min_key, path, trans->transid); + ret = btrfs_search_forward(root, &min_key, path, trans->transid); if (ret < 0) { goto next_dir_inode; } else if (ret > 0) { @@ -5916,7 +6145,6 @@ again: goto next_dir_inode; } -process_leaf: leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); for (i = path->slots[0]; i < nritems; i++) { @@ -5934,8 +6162,7 @@ process_leaf: di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); type = btrfs_dir_type(leaf, di); - if (btrfs_dir_transid(leaf, di) < trans->transid && - type != BTRFS_FT_DIR) + if (btrfs_dir_transid(leaf, di) < trans->transid) continue; btrfs_dir_item_key_to_cpu(leaf, di, &di_key); if (di_key.type == BTRFS_ROOT_ITEM_KEY) @@ -5973,16 +6200,6 @@ process_leaf: } break; } - if (i == nritems) { - ret = btrfs_next_leaf(log, path); - if (ret < 0) { - goto next_dir_inode; - } else if (ret > 0) { - ret = 0; - goto next_dir_inode; - } - goto process_leaf; - } if (min_key.offset < (u64)-1) { min_key.offset++; goto again; @@ -6713,15 +6930,32 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, mutex_unlock(&dir->log_mutex); } -/* - * Call this after adding a new name for a file and it will properly - * update the log to reflect the new name. +/** + * Update the log after adding a new name for an inode. + * + * @trans: Transaction handle. + * @old_dentry: The dentry associated with the old name and the old + * parent directory. + * @old_dir: The inode of the previous parent directory for the case + * of a rename. For a link operation, it must be NULL. + * @old_dir_index: The index number associated with the old name, meaningful + * only for rename operations (when @old_dir is not NULL). + * Ignored for link operations. + * @parent: The dentry associated with the directory under which the + * new name is located. + * + * Call this after adding a new name for an inode, as a result of a link or + * rename operation, and it will properly update the log to reflect the new name. 
*/ void btrfs_log_new_name(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent) + struct dentry *old_dentry, struct btrfs_inode *old_dir, + u64 old_dir_index, struct dentry *parent) { + struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry)); + struct btrfs_root *root = inode->root; struct btrfs_log_ctx ctx; + bool log_pinned = false; + int ret; /* * this will force the logging code to walk the dentry chain @@ -6734,26 +6968,83 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * if this inode hasn't been logged and directory we're renaming it * from hasn't been logged, we don't need to log it */ - if (!inode_logged(trans, inode) && - (!old_dir || !inode_logged(trans, old_dir))) - return; + ret = inode_logged(trans, inode, NULL); + if (ret < 0) { + goto out; + } else if (ret == 0) { + if (!old_dir) + return; + /* + * If the inode was not logged and we are doing a rename (old_dir is not + * NULL), check if old_dir was logged - if it was not we can return and + * do nothing. + */ + ret = inode_logged(trans, old_dir, NULL); + if (ret < 0) + goto out; + else if (ret == 0) + return; + } + ret = 0; /* * If we are doing a rename (old_dir is not NULL) from a directory that - * was previously logged, make sure the next log attempt on the directory - * is not skipped and logs the inode again. This is because the log may - * not currently be authoritative for a range including the old - * BTRFS_DIR_INDEX_KEY key, so we want to make sure after a log replay we - * do not end up with both the new and old dentries around (in case the - * inode is a directory we would have a directory with two hard links and - * 2 inode references for different parents). The next log attempt of - * old_dir will happen at btrfs_log_all_parents(), called through - * btrfs_log_inode_parent() below, because we have previously set - * inode->last_unlink_trans to the current transaction ID, either here or - * at btrfs_record_unlink_dir() in case the inode is a directory. + * was previously logged, make sure that on log replay we get the old + * dir entry deleted. This is needed because we will also log the new + * name of the renamed inode, so we need to make sure that after log + * replay we don't end up with both the new and old dir entries existing. */ - if (old_dir) - old_dir->logged_trans = 0; + if (old_dir && old_dir->logged_trans == trans->transid) { + struct btrfs_root *log = old_dir->root->log_root; + struct btrfs_path *path; + + ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); + + /* + * We have two inodes to update in the log, the old directory and + * the inode that got renamed, so we must pin the log to prevent + * anyone from syncing the log until we have updated both inodes + * in the log. + */ + log_pinned = true; + btrfs_pin_log_trans(root); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + /* + * Other concurrent task might be logging the old directory, + * as it can be triggered when logging other inode that had or + * still has a dentry in the old directory. So take the old + * directory's log_mutex to prevent getting an -EEXIST when + * logging a key to record the deletion, or having that other + * task logging the old directory get an -EEXIST if it attempts + * to log the same key after we just did it. In both cases that + * would result in falling back to a transaction commit. 
+ */ + mutex_lock(&old_dir->log_mutex); + ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), + old_dentry->d_name.name, + old_dentry->d_name.len, old_dir_index); + if (ret > 0) { + /* + * The dentry does not exist in the log, so record its + * deletion. + */ + btrfs_release_path(path); + ret = insert_dir_log_key(trans, log, path, + btrfs_ino(old_dir), + old_dir_index, old_dir_index); + } + mutex_unlock(&old_dir->log_mutex); + + btrfs_free_path(path); + if (ret < 0) + goto out; + } btrfs_init_log_ctx(&ctx, &inode->vfs_inode); ctx.logging_new_name = true; @@ -6765,5 +7056,16 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * inconsistent state after a rename operation. */ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); +out: + /* + * If an error happened mark the log for a full commit because it's not + * consistent and up to date or we couldn't find out if one of the + * inodes was logged before in this transaction. Do it before unpinning + * the log, to avoid any races with someone else trying to commit it. + */ + if (ret < 0) + btrfs_set_log_full_commit(trans); + if (log_pinned) + btrfs_end_log_trans(root); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index f6811c3df38a..1620f8170629 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -17,6 +17,8 @@ struct btrfs_log_ctx { int log_transid; bool log_new_dentries; bool logging_new_name; + /* Indicate if the inode being logged was logged before. */ + bool logged_before; /* Tracks the last logged dir item/index key offset. */ u64 last_dir_item_offset; struct inode *inode; @@ -32,6 +34,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, ctx->log_transid = 0; ctx->log_new_dentries = false; ctx->logging_new_name = false; + ctx->logged_before = false; ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); INIT_LIST_HEAD(&ctx->ordered_extents); @@ -86,7 +89,7 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, struct btrfs_inode *dir); void btrfs_log_new_name(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent); + struct dentry *old_dentry, struct btrfs_inode *old_dir, + u64 old_dir_index, struct dentry *parent); #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b07d382d53a8..1be7cb2f955f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -534,30 +534,20 @@ error: return ret; } -static bool device_path_matched(const char *path, struct btrfs_device *device) -{ - int found; - - rcu_read_lock(); - found = strcmp(rcu_str_deref(device->name), path); - rcu_read_unlock(); - - return found == 0; -} - -/* - * Search and remove all stale (devices which are not mounted) devices. +/** + * Search and remove all stale devices (which are not mounted). * When both inputs are NULL, it will search and release all stale devices. - * path: Optional. When provided will it release all unmounted devices - * matching this path only. - * skip_dev: Optional. Will skip this device when searching for the stale + * + * @devt: Optional. When provided will it release all unmounted devices + * matching this devt only. + * @skip_device: Optional. Will skip this device when searching for the stale * devices. - * Return: 0 for success or if @path is NULL. - * -EBUSY if @path is a mounted device. - * -ENOENT if @path does not match any device in the list. + * + * Return: 0 for success or if @devt is 0. 
+ * -EBUSY if @devt is a mounted device. + * -ENOENT if @devt does not match any device in the list. */ -static int btrfs_free_stale_devices(const char *path, - struct btrfs_device *skip_device) +static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device) { struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; struct btrfs_device *device, *tmp_device; @@ -565,7 +555,7 @@ static int btrfs_free_stale_devices(const char *path, lockdep_assert_held(&uuid_mutex); - if (path) + if (devt) ret = -ENOENT; list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { @@ -575,13 +565,11 @@ static int btrfs_free_stale_devices(const char *path, &fs_devices->devices, dev_list) { if (skip_device && skip_device == device) continue; - if (path && !device->name) - continue; - if (path && !device_path_matched(path, device)) + if (devt && devt != device->devt) continue; if (fs_devices->opened) { /* for an already deleted device return 0 */ - if (path && ret != 0) + if (devt && ret != 0) ret = -EBUSY; break; } @@ -614,7 +602,6 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device, fmode_t flags, void *holder) { - struct request_queue *q; struct block_device *bdev; struct btrfs_super_block *disk_super; u64 devid; @@ -656,8 +643,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } - q = bdev_get_queue(bdev); - if (!blk_queue_nonrot(q)) + if (!blk_queue_nonrot(bdev_get_queue(bdev))) fs_devices->rotating = true; device->bdev = bdev; @@ -781,11 +767,17 @@ static noinline struct btrfs_device *device_list_add(const char *path, struct rcu_string *name; u64 found_transid = btrfs_super_generation(disk_super); u64 devid = btrfs_stack_device_id(&disk_super->dev_item); + dev_t path_devt; + int error; bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID); bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID_V2); + error = lookup_bdev(path, &path_devt); + if (error) + return ERR_PTR(error); + if (fsid_change_in_progress) { if (!has_metadata_uuid) fs_devices = find_fsid_inprogress(disk_super); @@ -868,6 +860,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, return ERR_PTR(-ENOMEM); } rcu_assign_pointer(device->name, name); + device->devt = path_devt; list_add_rcu(&device->dev_list, &fs_devices->devices); fs_devices->num_devices++; @@ -928,25 +921,15 @@ static noinline struct btrfs_device *device_list_add(const char *path, /* * We are going to replace the device path for a given devid, * make sure it's the same device if the device is mounted + * + * NOTE: the device->fs_info may not be reliable here so pass + * in a NULL to message helpers instead. This avoids a possible + * use-after-free when the fs_info and fs_info->sb are already + * torn down. */ if (device->bdev) { - int error; - dev_t path_dev; - - error = lookup_bdev(path, &path_dev); - if (error) { - mutex_unlock(&fs_devices->device_list_mutex); - return ERR_PTR(error); - } - - if (device->bdev->bd_dev != path_dev) { + if (device->devt != path_devt) { mutex_unlock(&fs_devices->device_list_mutex); - /* - * device->fs_info may not be reliable here, so - * pass in a NULL instead. This avoids a - * possible use-after-free when the fs_info and - * fs_info->sb are already torn down. 
- */ btrfs_warn_in_rcu(NULL, "duplicate device %s devid %llu generation %llu scanned by %s (%d)", path, devid, found_transid, @@ -954,7 +937,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, task_pid_nr(current)); return ERR_PTR(-EEXIST); } - btrfs_info_in_rcu(device->fs_info, + btrfs_info_in_rcu(NULL, "devid %llu device path %s changed to %s scanned by %s (%d)", devid, rcu_str_deref(device->name), path, current->comm, @@ -972,6 +955,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, fs_devices->missing_devices--; clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); } + device->devt = path_devt; } /* @@ -1331,12 +1315,12 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev return disk_super; } -int btrfs_forget_devices(const char *path) +int btrfs_forget_devices(dev_t devt) { int ret; mutex_lock(&uuid_mutex); - ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL); + ret = btrfs_free_stale_devices(devt, NULL); mutex_unlock(&uuid_mutex); return ret; @@ -1385,10 +1369,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, } device = device_list_add(path, disk_super, &new_device_added); - if (!IS_ERR(device)) { - if (new_device_added) - btrfs_free_stale_devices(path, device); - } + if (!IS_ERR(device) && new_device_added) + btrfs_free_stale_devices(device->devt, device); btrfs_release_disk_super(disk_super); @@ -2102,6 +2084,11 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, u64 num_devices; int ret = 0; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "device remove not supported on extent tree v2 yet"); + return -EINVAL; + } + /* * The device list in fs_devices is accessed without locks (neither * uuid_mutex nor device_list_mutex) as it won't change on a mounted @@ -2606,7 +2593,6 @@ error: int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) { struct btrfs_root *root = fs_info->dev_root; - struct request_queue *q; struct btrfs_trans_handle *trans; struct btrfs_device *device; struct block_device *bdev; @@ -2668,6 +2654,9 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->fs_info = fs_info; device->bdev = bdev; + ret = lookup_bdev(device_path, &device->devt); + if (ret) + goto error_free_device; ret = btrfs_get_dev_zone_info(device, false); if (ret) @@ -2679,7 +2668,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path goto error_free_zone; } - q = bdev_get_queue(bdev); set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); device->generation = trans->transid; device->io_width = fs_info->sectorsize; @@ -2727,7 +2715,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path atomic64_add(device->total_bytes, &fs_info->free_chunk_space); - if (!blk_queue_nonrot(q)) + if (!blk_queue_nonrot(bdev_get_queue(bdev))) fs_devices->rotating = true; orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); @@ -2814,7 +2802,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path * We can ignore the return value as it typically returns -EINVAL and * only succeeds if the device was an alien. 
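The switch above from path string matching to dev_t matching (device->devt, filled in via lookup_bdev()) relies on the fact that many different path spellings - a /dev node, a /dev/disk/by-id symlink, a udev-created alias - all resolve to the same device number. A small user-space illustration of the same idea, resolving a path to its major:minor pair with stat(2); this is illustrative only and not part of the patch:

#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>

int main(int argc, char **argv)
{
	struct stat st;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <block-device-path>\n", argv[0]);
		return 1;
	}
	if (stat(argv[1], &st) != 0) {
		perror("stat");
		return 1;
	}
	if (!S_ISBLK(st.st_mode)) {
		fprintf(stderr, "%s is not a block device\n", argv[1]);
		return 1;
	}
	/*
	 * Two different paths that refer to the same device (for example
	 * /dev/sda1 and a /dev/disk/by-id symlink pointing to it) print the
	 * same major:minor pair, which is why comparing dev_t is more robust
	 * than comparing path strings.
	 */
	printf("%s -> dev_t %u:%u\n", argv[1],
	       major(st.st_rdev), minor(st.st_rdev));
	return 0;
}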
*/ - btrfs_forget_devices(device_path); + btrfs_forget_devices(device->devt); /* Update ctime/mtime for blkid or udev */ update_dev_time(device_path); @@ -3251,6 +3239,12 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) u64 length; int ret; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, + "relocate: not supported on extent tree v2 yet"); + return -EINVAL; + } + /* * Prevent races with automatic removal of unused block groups. * After we relocate and before we remove the chunk with offset @@ -7060,6 +7054,27 @@ static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, } #endif +static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info, + u64 devid, u8 *uuid) +{ + struct btrfs_device *dev; + + if (!btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_report_missing_device(fs_info, devid, uuid, true); + return ERR_PTR(-ENOENT); + } + + dev = add_missing_dev(fs_info->fs_devices, devid, uuid); + if (IS_ERR(dev)) { + btrfs_err(fs_info, "failed to init missing device %llu: %ld", + devid, PTR_ERR(dev)); + return dev; + } + btrfs_report_missing_device(fs_info, devid, uuid, false); + + return dev; +} + static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) { @@ -7147,28 +7162,17 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, BTRFS_UUID_SIZE); args.uuid = uuid; map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args); - if (!map->stripes[i].dev && - !btrfs_test_opt(fs_info, DEGRADED)) { - free_extent_map(em); - btrfs_report_missing_device(fs_info, devid, uuid, true); - return -ENOENT; - } if (!map->stripes[i].dev) { - map->stripes[i].dev = - add_missing_dev(fs_info->fs_devices, devid, - uuid); + map->stripes[i].dev = handle_missing_device(fs_info, + devid, uuid); if (IS_ERR(map->stripes[i].dev)) { free_extent_map(em); - btrfs_err(fs_info, - "failed to init missing dev %llu: %ld", - devid, PTR_ERR(map->stripes[i].dev)); return PTR_ERR(map->stripes[i].dev); } - btrfs_report_missing_device(fs_info, devid, uuid, false); } + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &(map->stripes[i].dev->dev_state)); - } write_lock(&map_tree->lock); @@ -8299,10 +8303,12 @@ static int relocating_repair_kthread(void *data) target = cache->start; btrfs_put_block_group(cache); + sb_start_write(fs_info->sb); if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { btrfs_info(fs_info, "zoned: skip relocating block group %llu to repair: EBUSY", target); + sb_end_write(fs_info->sb); return -EBUSY; } @@ -8330,6 +8336,7 @@ out: btrfs_put_block_group(cache); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); + sb_end_write(fs_info->sb); return ret; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 005c9e2a491a..bd297f23d19e 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -72,6 +72,11 @@ struct btrfs_device { /* the mode sent to blkdev_get */ fmode_t mode; + /* + * Device's major-minor number. Must be set even if the device is not + * opened (bdev == NULL), unless the device is missing. 
+ */ + dev_t devt; unsigned long dev_state; blk_status_t last_flush_error; @@ -505,7 +510,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, void *holder); -int btrfs_forget_devices(const char *path); +int btrfs_forget_devices(dev_t devt); void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices); void btrfs_assign_next_active_device(struct btrfs_device *device, diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index f559d517c7c4..b7b5fac1c779 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -652,8 +652,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) if (model == BLK_ZONED_HM || (model == BLK_ZONED_HA && incompat_zoned) || (model == BLK_ZONED_NONE && incompat_zoned)) { - struct btrfs_zoned_device_info *zone_info = - device->zone_info; + struct btrfs_zoned_device_info *zone_info; zone_info = device->zone_info; zoned_devices++; @@ -1215,12 +1214,12 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) struct btrfs_device *device; u64 logical = cache->start; u64 length = cache->length; - u64 physical = 0; int ret; int i; unsigned int nofs_flag; u64 *alloc_offsets = NULL; u64 *caps = NULL; + u64 *physical = NULL; unsigned long *active = NULL; u64 last_alloc = 0; u32 num_sequential = 0, num_conventional = 0; @@ -1264,6 +1263,12 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } + physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS); + if (!physical) { + ret = -ENOMEM; + goto out; + } + active = bitmap_zalloc(map->num_stripes, GFP_NOFS); if (!active) { ret = -ENOMEM; @@ -1277,14 +1282,14 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) int dev_replace_is_ongoing = 0; device = map->stripes[i].dev; - physical = map->stripes[i].physical; + physical[i] = map->stripes[i].physical; if (device->bdev == NULL) { alloc_offsets[i] = WP_MISSING_DEV; continue; } - is_sequential = btrfs_dev_is_sequential(device, physical); + is_sequential = btrfs_dev_is_sequential(device, physical[i]); if (is_sequential) num_sequential++; else @@ -1299,21 +1304,21 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) * This zone will be used for allocation, so mark this zone * non-empty. */ - btrfs_dev_clear_zone_empty(device, physical); + btrfs_dev_clear_zone_empty(device, physical[i]); down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) - btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical); + btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]); up_read(&dev_replace->rwsem); /* * The group is mapped to a sequential zone. Get the zone write * pointer to determine the allocation offset within the zone. 
*/ - WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size)); + WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size)); nofs_flag = memalloc_nofs_save(); - ret = btrfs_get_dev_zone(device, physical, &zone); + ret = btrfs_get_dev_zone(device, physical[i], &zone); memalloc_nofs_restore(nofs_flag); if (ret == -EIO || ret == -EOPNOTSUPP) { ret = 0; @@ -1339,7 +1344,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) case BLK_ZONE_COND_READONLY: btrfs_err(fs_info, "zoned: offline/readonly zone %llu on device %s (devid %llu)", - physical >> device->zone_info->zone_size_shift, + physical[i] >> device->zone_info->zone_size_shift, rcu_str_deref(device->name), device->devid); alloc_offsets[i] = WP_MISSING_DEV; break; @@ -1404,7 +1409,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) if (alloc_offsets[0] == WP_MISSING_DEV) { btrfs_err(fs_info, "zoned: cannot recover write pointer for zone %llu", - physical); + physical[0]); ret = -EIO; goto out; } @@ -1413,6 +1418,42 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) cache->zone_is_active = test_bit(0, active); break; case BTRFS_BLOCK_GROUP_DUP: + if (map->type & BTRFS_BLOCK_GROUP_DATA) { + btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg"); + ret = -EINVAL; + goto out; + } + if (alloc_offsets[0] == WP_MISSING_DEV) { + btrfs_err(fs_info, + "zoned: cannot recover write pointer for zone %llu", + physical[0]); + ret = -EIO; + goto out; + } + if (alloc_offsets[1] == WP_MISSING_DEV) { + btrfs_err(fs_info, + "zoned: cannot recover write pointer for zone %llu", + physical[1]); + ret = -EIO; + goto out; + } + if (alloc_offsets[0] != alloc_offsets[1]) { + btrfs_err(fs_info, + "zoned: write pointer offset mismatch of zones in DUP profile"); + ret = -EIO; + goto out; + } + if (test_bit(0, active) != test_bit(1, active)) { + if (!btrfs_zone_activate(cache)) { + ret = -EIO; + goto out; + } + } else { + cache->zone_is_active = test_bit(0, active); + } + cache->alloc_offset = alloc_offsets[0]; + cache->zone_capacity = min(caps[0], caps[1]); + break; case BTRFS_BLOCK_GROUP_RAID1: case BTRFS_BLOCK_GROUP_RAID0: case BTRFS_BLOCK_GROUP_RAID10: @@ -1465,6 +1506,7 @@ out: cache->physical_map = NULL; } bitmap_free(active); + kfree(physical); kfree(caps); kfree(alloc_offsets); free_extent_map(em); @@ -1781,50 +1823,55 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) struct btrfs_device *device; u64 physical; bool ret; + int i; if (!btrfs_is_zoned(block_group->fs_info)) return true; map = block_group->physical_map; - /* Currently support SINGLE profile only */ - ASSERT(map->num_stripes == 1); - device = map->stripes[0].dev; - physical = map->stripes[0].physical; - - if (device->zone_info->max_active_zones == 0) - return true; spin_lock(&block_group->lock); - if (block_group->zone_is_active) { ret = true; goto out_unlock; } - /* No space left */ - if (block_group->alloc_offset == block_group->zone_capacity) { - ret = false; - goto out_unlock; - } + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + physical = map->stripes[i].physical; - if (!btrfs_dev_set_active_zone(device, physical)) { - /* Cannot activate the zone */ - ret = false; - goto out_unlock; - } + if (device->zone_info->max_active_zones == 0) + continue; + + /* No space left */ + if (block_group->alloc_offset == block_group->zone_capacity) { + ret = false; + goto out_unlock; + } + + if (!btrfs_dev_set_active_zone(device, physical)) { + /* Cannot activate the 
zone */ + ret = false; + goto out_unlock; + } + + /* Successfully activated all the zones */ + if (i == map->num_stripes - 1) + block_group->zone_is_active = 1; - /* Successfully activated all the zones */ - block_group->zone_is_active = 1; + } spin_unlock(&block_group->lock); - /* For the active block group list */ - btrfs_get_block_group(block_group); + if (block_group->zone_is_active) { + /* For the active block group list */ + btrfs_get_block_group(block_group); - spin_lock(&fs_info->zone_active_bgs_lock); - ASSERT(list_empty(&block_group->active_bg_list)); - list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); - spin_unlock(&fs_info->zone_active_bgs_lock); + spin_lock(&fs_info->zone_active_bgs_lock); + list_add_tail(&block_group->active_bg_list, + &fs_info->zone_active_bgs); + spin_unlock(&fs_info->zone_active_bgs_lock); + } return true; @@ -1840,19 +1887,12 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group) struct btrfs_device *device; u64 physical; int ret = 0; + int i; if (!btrfs_is_zoned(fs_info)) return 0; map = block_group->physical_map; - /* Currently support SINGLE profile only */ - ASSERT(map->num_stripes == 1); - - device = map->stripes[0].dev; - physical = map->stripes[0].physical; - - if (device->zone_info->max_active_zones == 0) - return 0; spin_lock(&block_group->lock); if (!block_group->zone_is_active) { @@ -1904,25 +1944,34 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group) btrfs_clear_data_reloc_bg(block_group); spin_unlock(&block_group->lock); - ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, - physical >> SECTOR_SHIFT, - device->zone_info->zone_size >> SECTOR_SHIFT, - GFP_NOFS); - btrfs_dec_block_group_ro(block_group); + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + physical = map->stripes[i].physical; - if (!ret) { - btrfs_dev_clear_active_zone(device, physical); + if (device->zone_info->max_active_zones == 0) + continue; - spin_lock(&fs_info->zone_active_bgs_lock); - ASSERT(!list_empty(&block_group->active_bg_list)); - list_del_init(&block_group->active_bg_list); - spin_unlock(&fs_info->zone_active_bgs_lock); + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, + physical >> SECTOR_SHIFT, + device->zone_info->zone_size >> SECTOR_SHIFT, + GFP_NOFS); - /* For active_bg_list */ - btrfs_put_block_group(block_group); + if (ret) + return ret; + + btrfs_dev_clear_active_zone(device, physical); } + btrfs_dec_block_group_ro(block_group); - return ret; + spin_lock(&fs_info->zone_active_bgs_lock); + ASSERT(!list_empty(&block_group->active_bg_list)); + list_del_init(&block_group->active_bg_list); + spin_unlock(&fs_info->zone_active_bgs_lock); + + /* For active_bg_list */ + btrfs_put_block_group(block_group); + + return 0; } bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) |
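The zoned-mode changes above (per-stripe write pointer checks for the DUP profile, and activating or finishing every stripe's zone rather than only stripe 0) revolve around the zone state that the block layer reports: start, length, write pointer and condition. A small user-space sketch that reads one zone's write pointer and condition through the BLKREPORTZONE ioctl, which exposes the same struct blk_zone data the kernel-side btrfs_get_dev_zone() consumes; it is illustrative only and assumes a zoned block device and a zone start sector supplied on the command line:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/blkzoned.h>

int main(int argc, char **argv)
{
	struct blk_zone_report *rep;
	int fd, ret = 1;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <zoned-blockdev> <zone-start-sector>\n",
			argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Room for the report header plus a single zone descriptor. */
	rep = calloc(1, sizeof(*rep) + sizeof(struct blk_zone));
	if (!rep)
		goto out;
	rep->sector = strtoull(argv[2], NULL, 0);
	rep->nr_zones = 1;

	if (ioctl(fd, BLKREPORTZONE, rep) == 0 && rep->nr_zones == 1) {
		const struct blk_zone *z = &rep->zones[0];

		/*
		 * The write pointer is what the btrfs code above compares
		 * across the two DUP stripes; the condition distinguishes
		 * empty, open/active, full and offline/read-only zones.
		 */
		printf("zone start %llu len %llu wp %llu cond 0x%x\n",
		       (unsigned long long)z->start,
		       (unsigned long long)z->len,
		       (unsigned long long)z->wp, z->cond);
		ret = 0;
	} else {
		perror("BLKREPORTZONE");
	}
	free(rep);
out:
	close(fd);
	return ret;
}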