Diffstat (limited to 'fs')
50 files changed, 2754 insertions, 4717 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 37b6bab90c83..66fa9ab2c046 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -2,6 +2,7 @@ config BTRFS_FS tristate "Btrfs filesystem support" + select BLK_CGROUP_PUNT_BIO select CRYPTO select CRYPTO_CRC32C select LIBCRC32C diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 726592868e9c..5379c4714905 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -31,11 +31,11 @@ struct btrfs_failed_bio { * Initialize a btrfs_bio structure. This skips the embedded bio itself as it * is already initialized by the block layer. */ -void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, btrfs_bio_end_io_t end_io, void *private) { memset(bbio, 0, offsetof(struct btrfs_bio, bio)); - bbio->inode = inode; + bbio->fs_info = fs_info; bbio->end_io = end_io; bbio->private = private; atomic_set(&bbio->pending_ios, 1); @@ -48,41 +48,58 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, * Just like the underlying bio_alloc_bioset it will not fail as it is backed by * a mempool. */ -struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, - struct btrfs_inode *inode, - btrfs_bio_end_io_t end_io, void *private) +struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + struct btrfs_fs_info *fs_info, + btrfs_bio_end_io_t end_io, void *private) { + struct btrfs_bio *bbio; struct bio *bio; bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); - btrfs_bio_init(btrfs_bio(bio), inode, end_io, private); - return bio; + bbio = btrfs_bio(bio); + btrfs_bio_init(bbio, fs_info, end_io, private); + return bbio; } -static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, - struct bio *orig, u64 map_length, - bool use_append) +static blk_status_t btrfs_bio_extract_ordered_extent(struct btrfs_bio *bbio) { - struct btrfs_bio *orig_bbio = btrfs_bio(orig); + struct btrfs_ordered_extent *ordered; + int ret; + + ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset); + if (WARN_ON_ONCE(!ordered)) + return BLK_STS_IOERR; + ret = btrfs_extract_ordered_extent(bbio, ordered); + btrfs_put_ordered_extent(ordered); + + return errno_to_blk_status(ret); +} + +static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, + struct btrfs_bio *orig_bbio, + u64 map_length, bool use_append) +{ + struct btrfs_bio *bbio; struct bio *bio; if (use_append) { unsigned int nr_segs; - bio = bio_split_rw(orig, &fs_info->limits, &nr_segs, + bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs, &btrfs_clone_bioset, map_length); } else { - bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS, - &btrfs_clone_bioset); + bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, + GFP_NOFS, &btrfs_clone_bioset); } - btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio); - - btrfs_bio(bio)->file_offset = orig_bbio->file_offset; - if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED)) + bbio = btrfs_bio(bio); + btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); + bbio->inode = orig_bbio->inode; + bbio->file_offset = orig_bbio->file_offset; + if (!(orig_bbio->bio.bi_opf & REQ_BTRFS_ONE_ORDERED)) orig_bbio->file_offset += map_length; atomic_inc(&orig_bbio->pending_ios); - return bio; + return bbio; } static void btrfs_orig_write_end_io(struct bio *bio); @@ -164,7 +181,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, goto done; } - btrfs_submit_bio(&repair_bbio->bio, mirror); + 
btrfs_submit_bio(repair_bbio, mirror); return; } @@ -224,15 +241,16 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &btrfs_repair_bioset); repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; - bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); + __bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); repair_bbio = btrfs_bio(repair_bio); - btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio); + btrfs_bio_init(repair_bbio, fs_info, NULL, fbio); + repair_bbio->inode = failed_bbio->inode; repair_bbio->file_offset = failed_bbio->file_offset + bio_offset; mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); - btrfs_submit_bio(repair_bio, mirror); + btrfs_submit_bio(repair_bbio, mirror); return fbio; } @@ -246,6 +264,9 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de struct btrfs_failed_bio *fbio = NULL; u32 offset = 0; + /* Read-repair requires the inode field to be set by the submitter. */ + ASSERT(inode); + /* * Hand off repair bios to the repair code as there is no upper level * submitter for them. @@ -306,17 +327,17 @@ static void btrfs_end_bio_work(struct work_struct *work) struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); /* Metadata reads are checked and repaired by the submitter. */ - if (bbio->bio.bi_opf & REQ_META) - bbio->end_io(bbio); - else + if (bbio->inode && !(bbio->bio.bi_opf & REQ_META)) btrfs_check_read_bio(bbio, bbio->bio.bi_private); + else + bbio->end_io(bbio); } static void btrfs_simple_end_io(struct bio *bio) { struct btrfs_bio *bbio = btrfs_bio(bio); struct btrfs_device *dev = bio->bi_private; - struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + struct btrfs_fs_info *fs_info = bbio->fs_info; btrfs_bio_counter_dec(fs_info); @@ -340,7 +361,8 @@ static void btrfs_raid56_end_io(struct bio *bio) btrfs_bio_counter_dec(bioc->fs_info); bbio->mirror_num = bioc->mirror_num; - if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META)) + if (bio_op(bio) == REQ_OP_READ && bbio->inode && + !(bbio->bio.bi_opf & REQ_META)) btrfs_check_read_bio(bbio, NULL); else btrfs_orig_bbio_end_io(bbio); @@ -418,7 +440,11 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) dev->devid, bio->bi_iter.bi_size); btrfsic_check_bio(bio); - submit_bio(bio); + + if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT) + blkcg_punt_bio_submit(bio); + else + submit_bio(bio); } static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) @@ -534,10 +560,10 @@ static void run_one_async_done(struct btrfs_work *work) /* * All of the bios that pass through here are from async helpers. - * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. - * This changes nothing when cgroups aren't in use. + * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's + * context. This changes nothing when cgroups aren't in use. */ - bio->bi_opf |= REQ_CGROUP_PUNT; + bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT; __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); } @@ -562,7 +588,7 @@ static bool should_async_write(struct btrfs_bio *bbio) * in order. 
*/ if (bbio->bio.bi_opf & REQ_META) { - struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + struct btrfs_fs_info *fs_info = bbio->fs_info; if (btrfs_is_zoned(fs_info)) return false; @@ -582,7 +608,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, struct btrfs_io_context *bioc, struct btrfs_io_stripe *smap, int mirror_num) { - struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + struct btrfs_fs_info *fs_info = bbio->fs_info; struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); @@ -603,12 +629,12 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, return true; } -static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) +static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) { - struct btrfs_bio *bbio = btrfs_bio(bio); struct btrfs_inode *inode = bbio->inode; - struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_fs_info *fs_info = bbio->fs_info; struct btrfs_bio *orig_bbio = bbio; + struct bio *bio = &bbio->bio; u64 logical = bio->bi_iter.bi_sector << 9; u64 length = bio->bi_iter.bi_size; u64 map_length = length; @@ -631,15 +657,15 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) map_length = min(map_length, fs_info->max_zone_append_size); if (map_length < length) { - bio = btrfs_split_bio(fs_info, bio, map_length, use_append); - bbio = btrfs_bio(bio); + bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append); + bio = &bbio->bio; } /* * Save the iter for the end_io handler and preload the checksums for * data reads. */ - if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) { + if (bio_op(bio) == REQ_OP_READ && inode && !(bio->bi_opf & REQ_META)) { bbio->saved_iter = bio->bi_iter; ret = btrfs_lookup_bio_sums(bbio); if (ret) @@ -650,7 +676,7 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) if (use_append) { bio->bi_opf &= ~REQ_OP_WRITE; bio->bi_opf |= REQ_OP_ZONE_APPEND; - ret = btrfs_extract_ordered_extent(btrfs_bio(bio)); + ret = btrfs_bio_extract_ordered_extent(bbio); if (ret) goto fail_put_bio; } @@ -659,7 +685,7 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) * Csum items for reloc roots have already been cloned at this * point, so they are handled as part of the no-checksum case. */ - if (!(inode->flags & BTRFS_INODE_NODATASUM) && + if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) && !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && !btrfs_is_data_reloc_root(inode->root)) { if (should_async_write(bbio) && @@ -686,9 +712,12 @@ fail: return true; } -void btrfs_submit_bio(struct bio *bio, int mirror_num) +void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) { - while (!btrfs_submit_chunk(bio, mirror_num)) + /* If bbio->inode is not populated, its file_offset must be 0. 
*/ + ASSERT(bbio->inode || bbio->file_offset == 0); + + while (!btrfs_submit_chunk(bbio, mirror_num)) ; } @@ -706,12 +735,9 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num) { - struct btrfs_device *dev; + struct btrfs_io_stripe smap = { 0 }; struct bio_vec bvec; struct bio bio; - u64 map_length = 0; - u64 sector; - struct btrfs_io_context *bioc = NULL; int ret = 0; ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); @@ -720,68 +746,38 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, if (btrfs_repair_one_zone(fs_info, logical)) return 0; - map_length = length; - /* * Avoid races with device replace and make sure our bioc has devices * associated to its stripes that don't go away while we are doing the * read repair operation. */ btrfs_bio_counter_inc_blocked(fs_info); - if (btrfs_is_parity_mirror(fs_info, logical, length)) { - /* - * Note that we don't use BTRFS_MAP_WRITE because it's supposed - * to update all raid stripes, but here we just want to correct - * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad - * stripe's dev and sector. - */ - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, - &map_length, &bioc, 0); - if (ret) - goto out_counter_dec; - ASSERT(bioc->mirror_num == 1); - } else { - ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, - &map_length, &bioc, mirror_num); - if (ret) - goto out_counter_dec; - /* - * This happens when dev-replace is also running, and the - * mirror_num indicates the dev-replace target. - * - * In this case, we don't need to do anything, as the read - * error just means the replace progress hasn't reached our - * read range, and later replace routine would handle it well. - */ - if (mirror_num != bioc->mirror_num) - goto out_counter_dec; - } - - sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; - dev = bioc->stripes[bioc->mirror_num - 1].dev; - btrfs_put_bioc(bioc); + ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num); + if (ret < 0) + goto out_counter_dec; - if (!dev || !dev->bdev || - !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { + if (!smap.dev->bdev || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) { ret = -EIO; goto out_counter_dec; } - bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); - bio.bi_iter.bi_sector = sector; + bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); + bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; __bio_add_page(&bio, page, length, pg_offset); btrfsic_check_bio(&bio); ret = submit_bio_wait(&bio); if (ret) { /* try to remap that extent elsewhere? */ - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); + btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS); goto out_bio_uninit; } btrfs_info_rl_in_rcu(fs_info, "read error corrected: ino %llu off %llu (dev %s sector %llu)", - ino, start, btrfs_dev_name(dev), sector); + ino, start, btrfs_dev_name(smap.dev), + smap.physical >> SECTOR_SHIFT); ret = 0; out_bio_uninit: @@ -791,6 +787,45 @@ out_counter_dec: return ret; } +/* + * Submit a btrfs_bio based repair write. + * + * If @dev_replace is true, the write would be submitted to dev-replace target. 
+ */ +void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace) +{ + struct btrfs_fs_info *fs_info = bbio->fs_info; + u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + u64 length = bbio->bio.bi_iter.bi_size; + struct btrfs_io_stripe smap = { 0 }; + int ret; + + ASSERT(fs_info); + ASSERT(mirror_num > 0); + ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE); + ASSERT(!bbio->inode); + + btrfs_bio_counter_inc_blocked(fs_info); + ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num); + if (ret < 0) + goto fail; + + if (dev_replace) { + if (btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE && btrfs_is_zoned(fs_info)) { + bbio->bio.bi_opf &= ~REQ_OP_WRITE; + bbio->bio.bi_opf |= REQ_OP_ZONE_APPEND; + } + ASSERT(smap.dev == fs_info->dev_replace.srcdev); + smap.dev = fs_info->dev_replace.tgtdev; + } + __btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num); + return; + +fail: + btrfs_bio_counter_dec(fs_info); + btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); +} + int __init btrfs_bioset_init(void) { if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index 873ff85817f0..a8eca3a65673 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -30,7 +30,10 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); * passed to btrfs_submit_bio for mapping to the physical devices. */ struct btrfs_bio { - /* Inode and offset into it that this I/O operates on. */ + /* + * Inode and offset into it that this I/O operates on. + * Only set for data I/O. + */ struct btrfs_inode *inode; u64 file_offset; @@ -58,6 +61,9 @@ struct btrfs_bio { atomic_t pending_ios; struct work_struct end_io_work; + /* File system that this I/O operates on. */ + struct btrfs_fs_info *fs_info; + /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last. @@ -73,11 +79,11 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio) int __init btrfs_bioset_init(void); void __cold btrfs_bioset_exit(void); -void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, btrfs_bio_end_io_t end_io, void *private); -struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, - struct btrfs_inode *inode, - btrfs_bio_end_io_t end_io, void *private); +struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + struct btrfs_fs_info *fs_info, + btrfs_bio_end_io_t end_io, void *private); static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { @@ -88,7 +94,11 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) /* Bio only refers to one ordered extent. */ #define REQ_BTRFS_ONE_ORDERED REQ_DRV -void btrfs_submit_bio(struct bio *bio, int mirror_num); +/* Submit using blkcg_punt_bio_submit. 
*/ +#define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE + +void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num); +void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace); int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5fc670c27f86..957ad1c31c4f 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -160,15 +160,6 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) btrfs_discard_cancel_work(&cache->fs_info->discard_ctl, cache); - /* - * If not empty, someone is still holding mutex of - * full_stripe_lock, which can only be released by caller. - * And it will definitely cause use-after-free when caller - * tries to release full stripe lock. - * - * No better way to resolve, but only to warn. - */ - WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); kfree(cache->free_space_ctl); kfree(cache->physical_map); kfree(cache); @@ -1977,12 +1968,12 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, map = em->map_lookup; data_stripe_length = em->orig_block_len; - io_stripe_size = map->stripe_len; + io_stripe_size = BTRFS_STRIPE_LEN; chunk_start = em->start; /* For RAID5/6 adjust to a full IO stripe length */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - io_stripe_size = map->stripe_len * nr_data_stripes(map); + io_stripe_size = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT; buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); if (!buf) { @@ -1992,28 +1983,28 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, for (i = 0; i < map->num_stripes; i++) { bool already_inserted = false; - u64 stripe_nr; - u64 offset; + u32 stripe_nr; + u32 offset; int j; if (!in_range(physical, map->stripes[i].physical, data_stripe_length)) continue; - stripe_nr = physical - map->stripes[i].physical; - stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); + stripe_nr = (physical - map->stripes[i].physical) >> + BTRFS_STRIPE_LEN_SHIFT; + offset = (physical - map->stripes[i].physical) & + BTRFS_STRIPE_LEN_MASK; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID10)) { - stripe_nr = stripe_nr * map->num_stripes + i; - stripe_nr = div_u64(stripe_nr, map->sub_stripes); - } + BTRFS_BLOCK_GROUP_RAID10)) + stripe_nr = div_u64(stripe_nr * map->num_stripes + i, + map->sub_stripes); /* * The remaining case would be for RAID56, multiply by * nr_data_stripes(). 
Alternatively, just use rmap_len below * instead of map->stripe_len */ - bytenr = chunk_start + stripe_nr * io_stripe_size + offset; /* Ensure we don't add duplicate addresses */ @@ -2124,8 +2115,6 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( btrfs_init_free_space_ctl(cache, cache->free_space_ctl); atomic_set(&cache->frozen, 0); mutex_init(&cache->free_space_lock); - cache->full_stripe_locks_root.root = RB_ROOT; - mutex_init(&cache->full_stripe_locks_root.lock); return cache; } @@ -2672,7 +2661,7 @@ static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) } struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, - u64 bytes_used, u64 type, + u64 type, u64 chunk_offset, u64 size) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -2687,7 +2676,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran cache->length = size; set_free_space_tree_thresholds(cache); - cache->used = bytes_used; cache->flags = type; cache->cached = BTRFS_CACHE_FINISHED; cache->global_root_id = calculate_global_root_id(fs_info, cache->start); @@ -2738,9 +2726,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran #ifdef CONFIG_BTRFS_DEBUG if (btrfs_should_fragment_free_space(cache)) { - u64 new_bytes_used = size - bytes_used; - - cache->space_info->bytes_used += new_bytes_used >> 1; + cache->space_info->bytes_used += size >> 1; fragment_free_space(cache); } #endif diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 6e4a0b429ac3..cc0e4b37db2d 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -91,14 +91,6 @@ struct btrfs_caching_control { /* Once caching_thread() finds this much free space, it will wake up waiters. */ #define CACHING_CTL_WAKE_UP SZ_2M -/* - * Tree to record all locked full stripes of a RAID5/6 block group - */ -struct btrfs_full_stripe_locks_tree { - struct rb_root root; - struct mutex lock; -}; - struct btrfs_block_group { struct btrfs_fs_info *fs_info; struct inode *inode; @@ -229,9 +221,6 @@ struct btrfs_block_group { */ int swap_extents; - /* Record locked full stripes for RAID5/6 block group */ - struct btrfs_full_stripe_locks_tree full_stripe_locks_root; - /* * Allocation offset for the block group to implement sequential * allocation. This is used only on a zoned filesystem. 
@@ -302,7 +291,7 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info); void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg); int btrfs_read_block_groups(struct btrfs_fs_info *info); struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, - u64 bytes_used, u64 type, + u64 type, u64 chunk_offset, u64 size); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 5367a14d44d2..3ab707e26fa2 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -232,9 +232,6 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent) u64 num_bytes = 0; int ret = -ENOSPC; - if (!block_rsv) - return 0; - spin_lock(&block_rsv->lock); num_bytes = mult_perc(block_rsv->size, min_percent); if (block_rsv->reserved >= num_bytes) @@ -245,17 +242,15 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent) } int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, u64 min_reserved, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush) { - u64 num_bytes = 0; int ret = -ENOSPC; if (!block_rsv) return 0; spin_lock(&block_rsv->lock); - num_bytes = min_reserved; if (block_rsv->reserved >= num_bytes) ret = 0; else @@ -355,17 +350,19 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) /* * But we also want to reserve enough space so we can do the fallback - * global reserve for an unlink, which is an additional 5 items (see the - * comment in __unlink_start_trans for what we're modifying.) + * global reserve for an unlink, which is an additional + * BTRFS_UNLINK_METADATA_UNITS items. * * But we also need space for the delayed ref updates from the unlink, - * so its 10, 5 for the actual operation, and 5 for the delayed ref - * updates. + * so add BTRFS_UNLINK_METADATA_UNITS units for delayed refs, one for + * each unlink metadata item. */ - min_items += 10; + min_items += BTRFS_UNLINK_METADATA_UNITS; num_bytes = max_t(u64, num_bytes, - btrfs_calc_insert_metadata_size(fs_info, min_items)); + btrfs_calc_insert_metadata_size(fs_info, min_items) + + btrfs_calc_delayed_ref_bytes(fs_info, + BTRFS_UNLINK_METADATA_UNITS)); spin_lock(&sinfo->lock); spin_lock(&block_rsv->lock); diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 4cc41c9aaa82..6dc781709aca 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -65,7 +65,7 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent); int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, u64 min_reserved, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 9dc21622806e..ec2ae4406c16 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -142,11 +142,22 @@ struct btrfs_inode { /* a local copy of root's last_log_commit */ int last_log_commit; - /* - * Total number of bytes pending delalloc, used by stat to calculate the - * real block usage of the file. This is used only for files. 
- */ - u64 delalloc_bytes; + union { + /* + * Total number of bytes pending delalloc, used by stat to + * calculate the real block usage of the file. This is used + * only for files. + */ + u64 delalloc_bytes; + /* + * The lowest possible index of the next dir index key which + * points to an inode that needs to be logged. + * This is used only for directories. + * Use the helpers btrfs_get_first_dir_index_to_log() and + * btrfs_set_first_dir_index_to_log() to access this field. + */ + u64 first_dir_index_to_log; + }; union { /* @@ -247,6 +258,17 @@ struct btrfs_inode { struct inode vfs_inode; }; +static inline u64 btrfs_get_first_dir_index_to_log(const struct btrfs_inode *inode) +{ + return READ_ONCE(inode->first_dir_index_to_log); +} + +static inline void btrfs_set_first_dir_index_to_log(struct btrfs_inode *inode, + u64 index) +{ + WRITE_ONCE(inode->first_dir_index_to_log, index); +} + static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) { return container_of(inode, struct btrfs_inode, vfs_inode); @@ -407,7 +429,8 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); -blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio); +int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, + struct btrfs_ordered_extent *ordered); bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u32 bio_offset, struct bio_vec *bv); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index f42f31f22d13..2d0493f0a184 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -37,6 +37,8 @@ #include "file-item.h" #include "super.h" +struct bio_set btrfs_compressed_bioset; + static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; const char* btrfs_compress_type2str(enum btrfs_compression_type type) @@ -54,6 +56,25 @@ const char* btrfs_compress_type2str(enum btrfs_compression_type type) return NULL; } +static inline struct compressed_bio *to_compressed_bio(struct btrfs_bio *bbio) +{ + return container_of(bbio, struct compressed_bio, bbio); +} + +static struct compressed_bio *alloc_compressed_bio(struct btrfs_inode *inode, + u64 start, blk_opf_t op, + btrfs_bio_end_io_t end_io) +{ + struct btrfs_bio *bbio; + + bbio = btrfs_bio(bio_alloc_bioset(NULL, BTRFS_MAX_COMPRESSED_PAGES, op, + GFP_NOFS, &btrfs_compressed_bioset)); + btrfs_bio_init(bbio, inode->root->fs_info, end_io, NULL); + bbio->inode = inode; + bbio->file_offset = start; + return to_compressed_bio(bbio); +} + bool btrfs_compress_is_valid_type(const char *str, size_t len) { int i; @@ -139,32 +160,25 @@ static int compression_decompress(int type, struct list_head *ws, } } +static void btrfs_free_compressed_pages(struct compressed_bio *cb) +{ + for (unsigned int i = 0; i < cb->nr_pages; i++) + put_page(cb->compressed_pages[i]); + kfree(cb->compressed_pages); +} + static int btrfs_decompress_bio(struct compressed_bio *cb); static void end_compressed_bio_read(struct btrfs_bio *bbio) { - struct compressed_bio *cb = bbio->private; - unsigned int index; - struct page *page; + struct compressed_bio *cb = to_compressed_bio(bbio); + blk_status_t status = bbio->bio.bi_status; - if (bbio->bio.bi_status) - cb->status = bbio->bio.bi_status; - else - cb->status = errno_to_blk_status(btrfs_decompress_bio(cb)); - - /* Release the compressed pages */ - for (index = 0; 
index < cb->nr_pages; index++) { - page = cb->compressed_pages[index]; - page->mapping = NULL; - put_page(page); - } - - /* Do io completion on the original bio */ - btrfs_bio_end_io(btrfs_bio(cb->orig_bio), cb->status); + if (!status) + status = errno_to_blk_status(btrfs_decompress_bio(cb)); - /* Finally free the cb struct */ - kfree(cb->compressed_pages); - kfree(cb); + btrfs_free_compressed_pages(cb); + btrfs_bio_end_io(cb->orig_bbio, status); bio_put(&bbio->bio); } @@ -172,14 +186,14 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio) * Clear the writeback bits on all of the file * pages for a compressed write */ -static noinline void end_compressed_writeback(struct inode *inode, - const struct compressed_bio *cb) +static noinline void end_compressed_writeback(const struct compressed_bio *cb) { + struct inode *inode = &cb->bbio.inode->vfs_inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long index = cb->start >> PAGE_SHIFT; unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct folio_batch fbatch; - const int errno = blk_status_to_errno(cb->status); + const int errno = blk_status_to_errno(cb->bbio.bio.bi_status); int i; int ret; @@ -207,45 +221,25 @@ static noinline void end_compressed_writeback(struct inode *inode, /* the inode may be gone now */ } -static void finish_compressed_bio_write(struct compressed_bio *cb) +static void btrfs_finish_compressed_write_work(struct work_struct *work) { - struct inode *inode = cb->inode; - unsigned int index; + struct compressed_bio *cb = + container_of(work, struct compressed_bio, write_end_work); /* * Ok, we're the last bio for this extent, step one is to call back * into the FS and do all the end_io operations. */ - btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL, + btrfs_writepage_endio_finish_ordered(cb->bbio.inode, NULL, cb->start, cb->start + cb->len - 1, - cb->status == BLK_STS_OK); + cb->bbio.bio.bi_status == BLK_STS_OK); if (cb->writeback) - end_compressed_writeback(inode, cb); + end_compressed_writeback(cb); /* Note, our inode could be gone now */ - /* - * Release the compressed pages, these came from alloc_page and - * are not attached to the inode at all - */ - for (index = 0; index < cb->nr_pages; index++) { - struct page *page = cb->compressed_pages[index]; - - page->mapping = NULL; - put_page(page); - } - - /* Finally free the cb struct */ - kfree(cb->compressed_pages); - kfree(cb); -} - -static void btrfs_finish_compressed_write_work(struct work_struct *work) -{ - struct compressed_bio *cb = - container_of(work, struct compressed_bio, write_end_work); - - finish_compressed_bio_write(cb); + btrfs_free_compressed_pages(cb); + bio_put(&cb->bbio.bio); } /* @@ -257,13 +251,25 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) */ static void end_compressed_bio_write(struct btrfs_bio *bbio) { - struct compressed_bio *cb = bbio->private; - struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); + struct compressed_bio *cb = to_compressed_bio(bbio); + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; - cb->status = bbio->bio.bi_status; queue_work(fs_info->compressed_write_workers, &cb->write_end_work); +} - bio_put(&bbio->bio); +static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb) +{ + struct bio *bio = &cb->bbio.bio; + u32 offset = 0; + + while (offset < cb->compressed_len) { + u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE); + + /* Maximum compressed extent is smaller than bio size limit. 
*/ + __bio_add_page(bio, cb->compressed_pages[offset >> PAGE_SHIFT], + len, 0); + offset += len; + } } /* @@ -275,28 +281,24 @@ static void end_compressed_bio_write(struct btrfs_bio *bbio) * This also checksums the file bytes and gets things ready for * the end io hooks. */ -blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, +void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned int len, u64 disk_start, unsigned int compressed_len, struct page **compressed_pages, unsigned int nr_pages, blk_opf_t write_flags, - struct cgroup_subsys_state *blkcg_css, bool writeback) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct bio *bio = NULL; struct compressed_bio *cb; - u64 cur_disk_bytenr = disk_start; - blk_status_t ret = BLK_STS_OK; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(len, fs_info->sectorsize)); - cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); - if (!cb) - return BLK_STS_RESOURCE; - cb->status = BLK_STS_OK; - cb->inode = &inode->vfs_inode; + + write_flags |= REQ_BTRFS_ONE_ORDERED; + + cb = alloc_compressed_bio(inode, start, REQ_OP_WRITE | write_flags, + end_compressed_bio_write); cb->start = start; cb->len = len; cb->compressed_pages = compressed_pages; @@ -304,56 +306,10 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, cb->writeback = writeback; INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); cb->nr_pages = nr_pages; + cb->bbio.bio.bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; + btrfs_add_compressed_bio_pages(cb); - if (blkcg_css) { - kthread_associate_blkcg(blkcg_css); - write_flags |= REQ_CGROUP_PUNT; - } - - write_flags |= REQ_BTRFS_ONE_ORDERED; - bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_WRITE | write_flags, - BTRFS_I(cb->inode), end_compressed_bio_write, cb); - bio->bi_iter.bi_sector = cur_disk_bytenr >> SECTOR_SHIFT; - btrfs_bio(bio)->file_offset = start; - - while (cur_disk_bytenr < disk_start + compressed_len) { - u64 offset = cur_disk_bytenr - disk_start; - unsigned int index = offset >> PAGE_SHIFT; - unsigned int real_size; - unsigned int added; - struct page *page = compressed_pages[index]; - - /* - * We have various limits on the real read size: - * - page boundary - * - compressed length boundary - */ - real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); - real_size = min_t(u64, real_size, compressed_len - offset); - ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); - - added = bio_add_page(bio, page, real_size, offset_in_page(offset)); - /* - * Maximum compressed extent is smaller than bio size limit, - * thus bio_add_page() should always success. - */ - ASSERT(added == real_size); - cur_disk_bytenr += added; - } - - /* Finished the range. 
*/ - ASSERT(bio->bi_iter.bi_size); - btrfs_submit_bio(bio, 0); - if (blkcg_css) - kthread_associate_blkcg(NULL); - return ret; -} - -static u64 bio_end_offset(struct bio *bio) -{ - struct bio_vec *last = bio_last_bvec_all(bio); - - return page_offset(last->bv_page) + last->bv_len + last->bv_offset; + btrfs_submit_bio(&cb->bbio, 0); } /* @@ -374,7 +330,8 @@ static noinline int add_ra_bio_pages(struct inode *inode, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long end_index; - u64 cur = bio_end_offset(cb->orig_bio); + struct bio *orig_bio = &cb->orig_bbio->bio; + u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size; u64 isize = i_size_read(inode); int ret; struct page *page; @@ -464,7 +421,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, */ if (!em || cur < em->start || (cur + fs_info->sectorsize > extent_map_end(em)) || - (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) { + (em->block_start >> 9) != orig_bio->bi_iter.bi_sector) { free_extent_map(em); unlock_extent(tree, cur, page_end, NULL); unlock_page(page); @@ -484,7 +441,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, } add_size = min(em->start + em->len, page_end + 1) - cur; - ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur)); + ret = bio_add_page(orig_bio, page, add_size, offset_in_page(cur)); if (ret != add_size) { unlock_extent(tree, cur, page_end, NULL); unlock_page(page); @@ -515,17 +472,14 @@ static noinline int add_ra_bio_pages(struct inode *inode, * After the compressed pages are read, we copy the bytes into the * bio we were passed and then call the bio end_io calls */ -void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - int mirror_num) +void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_map_tree *em_tree; + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_map_tree *em_tree = &inode->extent_tree; struct compressed_bio *cb; unsigned int compressed_len; - struct bio *comp_bio; - const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; - u64 cur_disk_byte = disk_bytenr; - u64 file_offset; + u64 file_offset = bbio->file_offset; u64 em_len; u64 em_start; struct extent_map *em; @@ -533,12 +487,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int memstall = 0; blk_status_t ret; int ret2; - int i; - - em_tree = &BTRFS_I(inode)->extent_tree; - - file_offset = bio_first_bvec_all(bio)->bv_offset + - page_offset(bio_first_page_all(bio)); /* we need the actual starting offset of this extent in the file */ read_lock(&em_tree->lock); @@ -551,102 +499,54 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ASSERT(em->compress_type != BTRFS_COMPRESS_NONE); compressed_len = em->block_len; - cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); - if (!cb) { - ret = BLK_STS_RESOURCE; - goto out; - } - cb->status = BLK_STS_OK; - cb->inode = inode; + cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ, + end_compressed_bio_read); cb->start = em->orig_start; em_len = em->len; em_start = em->start; - cb->len = bio->bi_iter.bi_size; + cb->len = bbio->bio.bi_iter.bi_size; cb->compressed_len = compressed_len; cb->compress_type = em->compress_type; - cb->orig_bio = bio; + cb->orig_bbio = bbio; free_extent_map(em); - em = NULL; cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); cb->compressed_pages = 
kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS); if (!cb->compressed_pages) { ret = BLK_STS_RESOURCE; - goto fail; + goto out_free_bio; } ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages); if (ret2) { ret = BLK_STS_RESOURCE; - goto fail; + goto out_free_compressed_pages; } - add_ra_bio_pages(inode, em_start + em_len, cb, &memstall, &pflags); + add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall, + &pflags); /* include any pages we added in add_ra-bio_pages */ - cb->len = bio->bi_iter.bi_size; - - comp_bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, BTRFS_I(cb->inode), - end_compressed_bio_read, cb); - comp_bio->bi_iter.bi_sector = (cur_disk_byte >> SECTOR_SHIFT); - - while (cur_disk_byte < disk_bytenr + compressed_len) { - u64 offset = cur_disk_byte - disk_bytenr; - unsigned int index = offset >> PAGE_SHIFT; - unsigned int real_size; - unsigned int added; - struct page *page = cb->compressed_pages[index]; - - /* - * We have various limit on the real read size: - * - page boundary - * - compressed length boundary - */ - real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); - real_size = min_t(u64, real_size, compressed_len - offset); - ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); - - added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset)); - /* - * Maximum compressed extent is smaller than bio size limit, - * thus bio_add_page() should always success. - */ - ASSERT(added == real_size); - cur_disk_byte += added; - } + cb->len = bbio->bio.bi_iter.bi_size; + cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector; + btrfs_add_compressed_bio_pages(cb); if (memstall) psi_memstall_leave(&pflags); - /* - * Stash the initial offset of this chunk, as there is no direct - * correlation between compressed pages and the original file offset. - * The field is only used for printing error messages anyway. 
- */ - btrfs_bio(comp_bio)->file_offset = file_offset; - - ASSERT(comp_bio->bi_iter.bi_size); - btrfs_submit_bio(comp_bio, mirror_num); + btrfs_submit_bio(&cb->bbio, mirror_num); return; -fail: - if (cb->compressed_pages) { - for (i = 0; i < cb->nr_pages; i++) { - if (cb->compressed_pages[i]) - __free_page(cb->compressed_pages[i]); - } - } - +out_free_compressed_pages: kfree(cb->compressed_pages); - kfree(cb); +out_free_bio: + bio_put(&cb->bbio.bio); out: - free_extent_map(em); - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; + btrfs_bio_end_io(bbio, ret); } /* @@ -1038,6 +938,8 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) ret = compression_decompress_bio(workspace, cb); put_workspace(type, workspace); + if (!ret) + zero_fill_bio(&cb->orig_bbio->bio); return ret; } @@ -1062,6 +964,10 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, int __init btrfs_init_compress(void) { + if (bioset_init(&btrfs_compressed_bioset, BIO_POOL_SIZE, + offsetof(struct compressed_bio, bbio.bio), + BIOSET_NEED_BVECS)) + return -ENOMEM; btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE); btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB); btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO); @@ -1075,6 +981,7 @@ void __cold btrfs_exit_compress(void) btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB); btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO); zstd_cleanup_workspace_manager(); + bioset_exit(&btrfs_compressed_bioset); } /* @@ -1110,7 +1017,7 @@ void __cold btrfs_exit_compress(void) int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed) { - struct bio *orig_bio = cb->orig_bio; + struct bio *orig_bio = &cb->orig_bbio->bio; /* Offset inside the full decompressed extent */ u32 cur_offset; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index a5e3377db9ad..19ab2abeddc0 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -6,8 +6,8 @@ #ifndef BTRFS_COMPRESSION_H #define BTRFS_COMPRESSION_H -#include <linux/blk_types.h> #include <linux/sizes.h> +#include "bio.h" struct btrfs_inode; @@ -23,6 +23,7 @@ struct btrfs_inode; /* Maximum length of compressed data stored on disk */ #define BTRFS_MAX_COMPRESSED (SZ_128K) +#define BTRFS_MAX_COMPRESSED_PAGES (BTRFS_MAX_COMPRESSED / PAGE_SIZE) static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); /* Maximum size of data before compression */ @@ -37,9 +38,6 @@ struct compressed_bio { /* the pages with the compressed data on them */ struct page **compressed_pages; - /* inode that owns this data */ - struct inode *inode; - /* starting offset in the inode for our pages */ u64 start; @@ -55,14 +53,14 @@ struct compressed_bio { /* Whether this is a write for writeback. */ bool writeback; - /* IO errors */ - blk_status_t status; - union { /* For reads, this is the bio we are copying the data into */ - struct bio *orig_bio; + struct btrfs_bio *orig_bbio; struct work_struct write_end_work; }; + + /* Must be last. 
*/ + struct btrfs_bio bbio; }; static inline unsigned int btrfs_compress_type(unsigned int type_level) @@ -88,16 +86,14 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed); -blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, +void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned int len, u64 disk_start, unsigned int compressed_len, struct page **compressed_pages, unsigned int nr_pages, blk_opf_t write_flags, - struct cgroup_subsys_state *blkcg_css, bool writeback); -void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - int mirror_num); +void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num); unsigned int btrfs_compress_str2level(unsigned int type, const char *str); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a5b6bb54545f..3c983c70028a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -854,7 +854,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, * Search for a key in the given extent_buffer. * * The lower boundary for the search is specified by the slot number @first_slot. - * Use a value of 0 to search over the whole extent buffer. + * Use a value of 0 to search over the whole extent buffer. Works for both + * leaves and nodes. * * The slot in the extent buffer is returned via @slot. If the key exists in the * extent buffer, then @slot will point to the slot where the key is, otherwise @@ -863,8 +864,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, * Slot may point to the total number of items (i.e. one position beyond the last * key) if the key is bigger than the last key in the extent buffer. 
*/ -int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, - const struct btrfs_key *key, int *slot) +int btrfs_bin_search(struct extent_buffer *eb, int first_slot, + const struct btrfs_key *key, int *slot) { unsigned long p; int item_size; @@ -959,7 +960,7 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, if (slot < 0 || slot >= btrfs_header_nritems(parent)) return ERR_PTR(-ENOENT); - BUG_ON(level == 0); + ASSERT(level); check.level = level - 1; check.transid = btrfs_node_ptr_generation(parent, slot); @@ -1064,11 +1065,14 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4) return 0; - left = btrfs_read_node_slot(parent, pslot - 1); - if (IS_ERR(left)) - left = NULL; + if (pslot) { + left = btrfs_read_node_slot(parent, pslot - 1); + if (IS_ERR(left)) { + ret = PTR_ERR(left); + left = NULL; + goto enospc; + } - if (left) { __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); wret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left, @@ -1079,11 +1083,14 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } } - right = btrfs_read_node_slot(parent, pslot + 1); - if (IS_ERR(right)) - right = NULL; + if (pslot + 1 < btrfs_header_nritems(parent)) { + right = btrfs_read_node_slot(parent, pslot + 1); + if (IS_ERR(right)) { + ret = PTR_ERR(right); + right = NULL; + goto enospc; + } - if (right) { __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); wret = btrfs_cow_block(trans, root, right, parent, pslot + 1, &right, @@ -1240,14 +1247,14 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (!parent) return 1; - left = btrfs_read_node_slot(parent, pslot - 1); - if (IS_ERR(left)) - left = NULL; - /* first, try to make some room in the middle buffer */ - if (left) { + if (pslot) { u32 left_nr; + left = btrfs_read_node_slot(parent, pslot - 1); + if (IS_ERR(left)) + return PTR_ERR(left); + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); left_nr = btrfs_header_nritems(left); @@ -1292,16 +1299,17 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_tree_unlock(left); free_extent_buffer(left); } - right = btrfs_read_node_slot(parent, pslot + 1); - if (IS_ERR(right)) - right = NULL; /* * then try to empty the right most buffer into the middle */ - if (right) { + if (pslot + 1 < btrfs_header_nritems(parent)) { u32 right_nr; + right = btrfs_read_node_slot(parent, pslot + 1); + if (IS_ERR(right)) + return PTR_ERR(right); + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); right_nr = btrfs_header_nritems(right); @@ -1864,7 +1872,7 @@ static inline int search_for_key_slot(struct extent_buffer *eb, return 0; } - return btrfs_generic_bin_search(eb, search_low_slot, key, slot); + return btrfs_bin_search(eb, search_low_slot, key, slot); } static int search_leaf(struct btrfs_trans_handle *trans, @@ -2321,7 +2329,7 @@ again: */ btrfs_unlock_up_safe(p, level + 1); - ret = btrfs_bin_search(b, key, &slot); + ret = btrfs_bin_search(b, 0, key, &slot); if (ret < 0) goto done; @@ -2482,26 +2490,15 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *path) { - while (1) { + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { int ret; - const int slot = path->slots[0]; - const struct extent_buffer *leaf = path->nodes[0]; - /* This is where we start walking the path. 
*/ - if (slot >= btrfs_header_nritems(leaf)) { - /* - * If we've reached the last slot in this leaf we need - * to go to the next leaf and reset the path. - */ - ret = btrfs_next_leaf(root, path); - if (ret) - return ret; - continue; - } - /* Store the found, valid item in @key. */ - btrfs_item_key_to_cpu(leaf, key, slot); - break; + ret = btrfs_next_leaf(root, path); + if (ret) + return ret; } + + btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); return 0; } @@ -3198,12 +3195,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_write_locked(path->nodes[1]); right = btrfs_read_node_slot(upper, slot + 1); - /* - * slot + 1 is not valid or we fail to read the right node, - * no big deal, just return. - */ if (IS_ERR(right)) - return 1; + return PTR_ERR(right); __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); @@ -3417,12 +3410,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_write_locked(path->nodes[1]); left = btrfs_read_node_slot(path->nodes[1], slot - 1); - /* - * slot - 1 is not valid or we fail to read the left node, - * no big deal, just return. - */ if (IS_ERR(left)) - return 1; + return PTR_ERR(left); __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); @@ -4576,7 +4565,7 @@ again: while (1) { nritems = btrfs_header_nritems(cur); level = btrfs_header_level(cur); - sret = btrfs_bin_search(cur, min_key, &slot); + sret = btrfs_bin_search(cur, 0, min_key, &slot); if (sret < 0) { ret = sret; goto out; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 97897107fab5..4c1986cd5bed 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -508,22 +508,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); int __init btrfs_ctree_init(void); void __cold btrfs_ctree_exit(void); -int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, - const struct btrfs_key *key, int *slot); +int btrfs_bin_search(struct extent_buffer *eb, int first_slot, + const struct btrfs_key *key, int *slot); -/* - * Simple binary search on an extent buffer. Works for both leaves and nodes, and - * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). - */ -static inline int btrfs_bin_search(struct extent_buffer *eb, - const struct btrfs_key *key, - int *slot) -{ - return btrfs_generic_bin_search(eb, 0, key, slot); -} - -int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, - int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 7ddb1d104e8e..427abaf608b8 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -358,8 +358,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, * racing with an ordered completion or some such that would think it * needs to free the reservation we just made. 
*/ - spin_lock(&inode->lock); nr_extents = count_max_extents(fs_info, num_bytes); + spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, nr_extents); inode->csum_bytes += disk_num_bytes; btrfs_calculate_inode_block_rsv_size(fs_info, inode); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 886ffb232eac..0b32432d7d56 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -53,24 +53,6 @@ bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) return ret; } -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) -{ - u64 num_entries = - atomic_read(&trans->transaction->delayed_refs.num_entries); - u64 avg_runtime; - u64 val; - - smp_mb(); - avg_runtime = trans->fs_info->avg_delayed_ref_runtime; - val = num_entries * avg_runtime; - if (val >= NSEC_PER_SEC) - return 1; - if (val >= NSEC_PER_SEC / 2) - return 2; - - return btrfs_check_space_for_delayed_refs(trans->fs_info); -} - /* * Release a ref head's reservation. * @@ -83,20 +65,9 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; - u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr); + const u64 num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr); u64 released = 0; - /* - * We have to check the mount option here because we could be enabling - * the free space tree for the first time and don't have the compat_ro - * option set yet. - * - * We need extra reservations if we have the free space tree because - * we'll have to modify that tree as well. - */ - if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) - num_bytes *= 2; - released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); if (released) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", @@ -118,18 +89,8 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) if (!trans->delayed_ref_updates) return; - num_bytes = btrfs_calc_insert_metadata_size(fs_info, - trans->delayed_ref_updates); - /* - * We have to check the mount option here because we could be enabling - * the free space tree for the first time and don't have the compat_ro - * option set yet. - * - * We need extra reservations if we have the free space tree because - * we'll have to modify that tree as well. 
- */ - if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) - num_bytes *= 2; + num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, + trans->delayed_ref_updates); spin_lock(&delayed_rsv->lock); delayed_rsv->size += num_bytes; @@ -200,7 +161,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; - u64 limit = btrfs_calc_insert_metadata_size(fs_info, 1); + u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1); u64 num_bytes = 0; int ret = -ENOSPC; @@ -217,7 +178,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); if (ret) return ret; - btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0); + btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, num_bytes, 1); return 0; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 2eb34abf700f..b54261fe509b 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -253,6 +253,27 @@ extern struct kmem_cache *btrfs_delayed_extent_op_cachep; int __init btrfs_delayed_ref_init(void); void __cold btrfs_delayed_ref_exit(void); +static inline u64 btrfs_calc_delayed_ref_bytes(const struct btrfs_fs_info *fs_info, + int num_delayed_refs) +{ + u64 num_bytes; + + num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_delayed_refs); + + /* + * We have to check the mount option here because we could be enabling + * the free space tree for the first time and don't have the compat_ro + * option set yet. + * + * We need extra reservations if we have the free space tree because + * we'll have to modify that tree as well. + */ + if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) + num_bytes *= 2; + + return num_bytes; +} + static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, int action, u64 bytenr, u64 len, u64 parent) { @@ -385,7 +406,6 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *src, u64 num_bytes); -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9e1596bb208d..59ea049fe7ee 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1341,17 +1341,8 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) { int ret; - unsigned int nofs_flag; - /* - * We might be called under a transaction (e.g. 
indirect backref - * resolution) which could deadlock if it triggers memory reclaim - */ - nofs_flag = memalloc_nofs_save(); - ret = btrfs_drew_lock_init(&root->snapshot_lock); - memalloc_nofs_restore(nofs_flag); - if (ret) - goto fail; + btrfs_drew_lock_init(&root->snapshot_lock); if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && !btrfs_is_data_reloc_root(root)) { @@ -2065,7 +2056,6 @@ void btrfs_put_root(struct btrfs_root *root) WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)); if (root->anon_dev) free_anon_bdev(root->anon_dev); - btrfs_drew_lock_destroy(&root->snapshot_lock); free_root_extent_buffers(root); #ifdef CONFIG_BTRFS_DEBUG spin_lock(&root->fs_info->fs_roots_radix_lock); @@ -2125,11 +2115,16 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info) atomic_set(&fs_info->reloc_cancel_req, 0); } -static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) +static int btrfs_init_btree_inode(struct super_block *sb) { - struct inode *inode = fs_info->btree_inode; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID, fs_info->tree_root); + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + return -ENOMEM; inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; set_nlink(inode, 1); @@ -2140,6 +2135,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) */ inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &btree_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, @@ -2152,6 +2148,9 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) BTRFS_I(inode)->location.offset = 0; set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); __insert_inode_hash(inode, hash); + fs_info->btree_inode = inode; + + return 0; } static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) @@ -2966,7 +2965,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) atomic64_set(&fs_info->free_chunk_space, 0); fs_info->tree_mod_log = RB_ROOT; fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; - fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ btrfs_init_ref_verify(fs_info); fs_info->thread_pool_size = min_t(unsigned long, @@ -3344,14 +3342,11 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device struct btrfs_root *tree_root; struct btrfs_root *chunk_root; int ret; - int err = -EINVAL; int level; ret = init_mount_fs_info(fs_info, sb); - if (ret) { - err = ret; + if (ret) goto fail; - } /* These need to be init'ed before we start creating inodes and such. 
*/ tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, @@ -3361,17 +3356,13 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device GFP_KERNEL); fs_info->chunk_root = chunk_root; if (!tree_root || !chunk_root) { - err = -ENOMEM; + ret = -ENOMEM; goto fail; } - fs_info->btree_inode = new_inode(sb); - if (!fs_info->btree_inode) { - err = -ENOMEM; + ret = btrfs_init_btree_inode(sb); + if (ret) goto fail; - } - mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); - btrfs_init_btree_inode(fs_info); invalidate_bdev(fs_devices->latest_dev->bdev); @@ -3380,7 +3371,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev); if (IS_ERR(disk_super)) { - err = PTR_ERR(disk_super); + ret = PTR_ERR(disk_super); goto fail_alloc; } @@ -3392,7 +3383,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (!btrfs_supported_super_csum(csum_type)) { btrfs_err(fs_info, "unsupported checksum algorithm: %u", csum_type); - err = -EINVAL; + ret = -EINVAL; btrfs_release_disk_super(disk_super); goto fail_alloc; } @@ -3401,7 +3392,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device ret = btrfs_init_csum_hash(fs_info, csum_type); if (ret) { - err = ret; btrfs_release_disk_super(disk_super); goto fail_alloc; } @@ -3412,7 +3402,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ if (btrfs_check_super_csum(fs_info, disk_super)) { btrfs_err(fs_info, "superblock checksum mismatch"); - err = -EINVAL; + ret = -EINVAL; btrfs_release_disk_super(disk_super); goto fail_alloc; } @@ -3442,12 +3432,15 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device ret = btrfs_validate_mount_super(fs_info); if (ret) { btrfs_err(fs_info, "superblock contains fatal errors"); - err = -EINVAL; + ret = -EINVAL; goto fail_alloc; } - if (!btrfs_super_root(disk_super)) + if (!btrfs_super_root(disk_super)) { + btrfs_err(fs_info, "invalid superblock tree root bytenr"); + ret = -EINVAL; goto fail_alloc; + } /* check FS state, whether FS is broken. 
*/ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) @@ -3474,16 +3467,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->stripesize = stripesize; ret = btrfs_parse_options(fs_info, options, sb->s_flags); - if (ret) { - err = ret; + if (ret) goto fail_alloc; - } ret = btrfs_check_features(fs_info, !sb_rdonly(sb)); - if (ret < 0) { - err = ret; + if (ret < 0) goto fail_alloc; - } if (sectorsize < PAGE_SIZE) { struct btrfs_subpage_info *subpage_info; @@ -3503,17 +3492,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); - if (!subpage_info) + if (!subpage_info) { + ret = -ENOMEM; goto fail_alloc; + } btrfs_init_subpage_info(subpage_info, sectorsize); fs_info->subpage_info = subpage_info; } ret = btrfs_init_workqueues(fs_info); - if (ret) { - err = ret; + if (ret) goto fail_sb_buffer; - } sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); @@ -3559,6 +3548,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device btrfs_free_extra_devids(fs_devices); if (!fs_devices->latest_dev->bdev) { btrfs_err(fs_info, "failed to read devices"); + ret = -EIO; goto fail_tree_roots; } @@ -3574,8 +3564,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device ret = btrfs_get_dev_zone_info_all_devices(fs_info); if (ret) { btrfs_err(fs_info, - "zoned: failed to read device zone info: %d", - ret); + "zoned: failed to read device zone info: %d", ret); goto fail_block_groups; } @@ -3654,19 +3643,24 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, "writable mount is not allowed due to too many missing devices"); + ret = -EINVAL; goto fail_sysfs; } fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info, "btrfs-cleaner"); - if (IS_ERR(fs_info->cleaner_kthread)) + if (IS_ERR(fs_info->cleaner_kthread)) { + ret = PTR_ERR(fs_info->cleaner_kthread); goto fail_sysfs; + } fs_info->transaction_kthread = kthread_run(transaction_kthread, tree_root, "btrfs-transaction"); - if (IS_ERR(fs_info->transaction_kthread)) + if (IS_ERR(fs_info->transaction_kthread)) { + ret = PTR_ERR(fs_info->transaction_kthread); goto fail_cleaner; + } if (!btrfs_test_opt(fs_info, NOSSD) && !fs_info->fs_devices->rotating) { @@ -3684,7 +3678,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->fs_devices->discardable) { btrfs_set_and_info(fs_info, DISCARD_ASYNC, "auto enabling async discard"); - btrfs_clear_opt(fs_info->mount_opt, NODISCARD); } #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY @@ -3711,16 +3704,14 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device !btrfs_test_opt(fs_info, NOLOGREPLAY)) { btrfs_info(fs_info, "start tree-log replay"); ret = btrfs_replay_log(fs_info, fs_devices); - if (ret) { - err = ret; + if (ret) goto fail_qgroup; - } } fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true); if (IS_ERR(fs_info->fs_root)) { - err = PTR_ERR(fs_info->fs_root); - btrfs_warn(fs_info, "failed to read fs tree: %d", err); + ret = PTR_ERR(fs_info->fs_root); + btrfs_warn(fs_info, "failed to read fs tree: %d", ret); fs_info->fs_root = NULL; goto fail_qgroup; } @@ -3797,7 +3788,8 @@ fail_alloc: 
iput(fs_info->btree_inode); fail: btrfs_close_devices(fs_info->fs_devices); - return err; + ASSERT(ret < 0); + return ret; } ALLOW_ERROR_INJECTION(open_ctree, ERRNO); @@ -4094,6 +4086,8 @@ static void write_dev_flush(struct btrfs_device *device) { struct bio *bio = &device->flush_bio; + device->last_flush_error = BLK_STS_OK; + #ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY /* * When a disk has write caching disabled, we skip submission of a bio @@ -4122,25 +4116,24 @@ static void write_dev_flush(struct btrfs_device *device) /* * If the flush bio has been submitted by write_dev_flush, wait for it. + * Return true for any error, and false otherwise. */ -static blk_status_t wait_dev_flush(struct btrfs_device *device) +static bool wait_dev_flush(struct btrfs_device *device) { struct bio *bio = &device->flush_bio; - if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) - return BLK_STS_OK; + if (!test_and_clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) + return false; - clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); wait_for_completion_io(&device->flush_wait); - return bio->bi_status; -} + if (bio->bi_status) { + device->last_flush_error = bio->bi_status; + btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); + return true; + } -static int check_barrier_error(struct btrfs_fs_info *fs_info) -{ - if (!btrfs_check_rw_degradable(fs_info, NULL)) - return -EIO; - return 0; + return false; } /* @@ -4152,7 +4145,6 @@ static int barrier_all_devices(struct btrfs_fs_info *info) struct list_head *head; struct btrfs_device *dev; int errors_wait = 0; - blk_status_t ret; lockdep_assert_held(&info->fs_devices->device_list_mutex); /* send down all the barriers */ @@ -4167,7 +4159,6 @@ static int barrier_all_devices(struct btrfs_fs_info *info) continue; write_dev_flush(dev); - dev->last_flush_error = BLK_STS_OK; } /* wait for all the barriers */ @@ -4182,23 +4173,17 @@ static int barrier_all_devices(struct btrfs_fs_info *info) !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; - ret = wait_dev_flush(dev); - if (ret) { - dev->last_flush_error = ret; - btrfs_dev_stat_inc_and_print(dev, - BTRFS_DEV_STAT_FLUSH_ERRS); + if (wait_dev_flush(dev)) errors_wait++; - } } - if (errors_wait) { - /* - * At some point we need the status of all disks - * to arrive at the volume status. So error checking - * is being pushed to a separate loop. - */ - return check_barrier_error(info); - } + /* + * Checks last_flush_error of disks in order to determine the device + * state. 
+ */ + if (errors_wait && !btrfs_check_rw_degradable(info, NULL)) + return -EIO; + return 0; } @@ -4404,12 +4389,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) root_objectid = gang[i]->root_key.objectid; err = btrfs_orphan_cleanup(gang[i]); if (err) - break; + goto out; btrfs_put_root(gang[i]); } root_objectid++; } - +out: /* release the uncleaned roots due to error */ for (; i < ret; i++) { if (gang[i]) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 824c657f59e8..5cd289de4e92 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1894,8 +1894,7 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( } static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *locked_ref, - unsigned long *run_refs) + struct btrfs_delayed_ref_head *locked_ref) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; @@ -1917,7 +1916,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, return -EAGAIN; } - (*run_refs)++; ref->in_tree = 0; rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); RB_CLEAR_NODE(&ref->ref_node); @@ -1981,10 +1979,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *locked_ref = NULL; - ktime_t start = ktime_get(); int ret; unsigned long count = 0; - unsigned long actual_count = 0; delayed_refs = &trans->transaction->delayed_refs; do { @@ -2014,8 +2010,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, spin_lock(&locked_ref->lock); btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); - ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, - &actual_count); + ret = btrfs_run_delayed_refs_for_head(trans, locked_ref); if (ret < 0 && ret != -EAGAIN) { /* * Error, btrfs_run_delayed_refs_for_head already @@ -2046,24 +2041,6 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, cond_resched(); } while ((nr != -1 && count < nr) || locked_ref); - /* - * We don't want to include ref heads since we can have empty ref heads - * and those will drastically skew our runtime down since we just do - * accounting, no actual extent tree updates. - */ - if (actual_count > 0) { - u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); - u64 avg; - - /* - * We weigh the current average higher than our current runtime - * to avoid large swings in the average. - */ - spin_lock(&delayed_refs->lock); - avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; - fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ - spin_unlock(&delayed_refs->lock); - } return 0; } @@ -5509,11 +5486,11 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, { int level = wc->level; int lookup_info = 1; - int ret; + int ret = 0; while (level >= 0) { ret = walk_down_proc(trans, root, path, wc, lookup_info); - if (ret > 0) + if (ret) break; if (level == 0) @@ -5528,10 +5505,10 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, path->slots[level]++; continue; } else if (ret < 0) - return ret; + break; level = wc->level; } - return 0; + return (ret == 1) ? 
0 : ret; } static noinline int walk_up_tree(struct btrfs_trans_handle *trans, @@ -5708,12 +5685,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) ret = walk_down_tree(trans, root, path, wc); if (ret < 0) { + btrfs_abort_transaction(trans, ret); err = ret; break; } ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); if (ret < 0) { + btrfs_abort_transaction(trans, ret); err = ret; break; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 40300e8e5f99..a1adadd5d25d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -97,11 +97,13 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) * how many bytes are there before stripe/ordered extent boundary. */ struct btrfs_bio_ctrl { - struct bio *bio; + struct btrfs_bio *bbio; int mirror_num; enum btrfs_compression_type compress_type; u32 len_to_oe_boundary; + blk_opf_t opf; btrfs_bio_end_io_t end_io_func; + struct writeback_control *wbc; /* * This is for metadata read, to provide the extra needed verification @@ -117,51 +119,41 @@ struct btrfs_bio_ctrl { * does the unlocking. */ bool extent_locked; - - /* Tell the submit_bio code to use REQ_SYNC */ - bool sync_io; }; static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { - struct bio *bio; - struct bio_vec *bv; - struct inode *inode; - int mirror_num; + struct btrfs_bio *bbio = bio_ctrl->bbio; + int mirror_num = bio_ctrl->mirror_num; - if (!bio_ctrl->bio) + if (!bbio) return; - bio = bio_ctrl->bio; - bv = bio_first_bvec_all(bio); - inode = bv->bv_page->mapping->host; - mirror_num = bio_ctrl->mirror_num; - /* Caller should ensure the bio has at least some range added */ - ASSERT(bio->bi_iter.bi_size); + ASSERT(bbio->bio.bi_iter.bi_size); - if (!is_data_inode(inode)) { - if (btrfs_op(bio) != BTRFS_MAP_WRITE) { + if (!is_data_inode(&bbio->inode->vfs_inode)) { + if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) { /* * For metadata read, we should have the parent_check, * and copy it to bbio for metadata verification. */ ASSERT(bio_ctrl->parent_check); - memcpy(&btrfs_bio(bio)->parent_check, + memcpy(&bbio->parent_check, bio_ctrl->parent_check, sizeof(struct btrfs_tree_parent_check)); } - bio->bi_opf |= REQ_META; + bbio->bio.bi_opf |= REQ_META; } - if (btrfs_op(bio) == BTRFS_MAP_READ && + if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ && bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) - btrfs_submit_compressed_read(inode, bio, mirror_num); + btrfs_submit_compressed_read(bbio, mirror_num); else - btrfs_submit_bio(bio, mirror_num); + btrfs_submit_bio(bbio, mirror_num); - /* The bio is owned by the end_io handler now */ - bio_ctrl->bio = NULL; + /* The bbio is owned by the end_io handler now */ + bio_ctrl->bbio = NULL; } /* @@ -169,16 +161,16 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) */ static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret) { - struct bio *bio = bio_ctrl->bio; + struct btrfs_bio *bbio = bio_ctrl->bbio; - if (!bio) + if (!bbio) return; if (ret) { ASSERT(ret < 0); - btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); + btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); /* The bio is owned by the end_io handler now */ - bio_ctrl->bio = NULL; + bio_ctrl->bbio = NULL; } else { submit_one_bio(bio_ctrl); } @@ -867,89 +859,52 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) return 0; } -/* - * Attempt to add a page to bio. 
- * - * @bio_ctrl: record both the bio, and its bio_flags - * @page: page to add to the bio - * @disk_bytenr: offset of the new bio or to check whether we are adding - * a contiguous page to the previous one - * @size: portion of page that we want to write - * @pg_offset: starting offset in the page - * @compress_type: compression type of the current bio to see if we can merge them - * - * Attempt to add a page to bio considering stripe alignment etc. - * - * Return >= 0 for the number of bytes added to the bio. - * Can return 0 if the current bio is already at stripe/zone boundary. - * Return <0 for error. - */ -static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, - struct page *page, - u64 disk_bytenr, unsigned int size, - unsigned int pg_offset, - enum btrfs_compression_type compress_type) +static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, + struct page *page, u64 disk_bytenr, + unsigned int pg_offset) { - struct bio *bio = bio_ctrl->bio; - u32 bio_size = bio->bi_iter.bi_size; - u32 real_size; + struct bio *bio = &bio_ctrl->bbio->bio; + struct bio_vec *bvec = bio_last_bvec_all(bio); const sector_t sector = disk_bytenr >> SECTOR_SHIFT; - bool contig = false; - ASSERT(bio); - /* The limit should be calculated when bio_ctrl->bio is allocated */ - ASSERT(bio_ctrl->len_to_oe_boundary); - if (bio_ctrl->compress_type != compress_type) - return 0; - - - if (bio->bi_iter.bi_size == 0) { - /* We can always add a page into an empty bio. */ - contig = true; - } else if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE) { - struct bio_vec *bvec = bio_last_bvec_all(bio); - - /* - * The contig check requires the following conditions to be met: - * 1) The pages are belonging to the same inode - * This is implied by the call chain. - * - * 2) The range has adjacent logical bytenr - * - * 3) The range has adjacent file offset - * This is required for the usage of btrfs_bio->file_offset. - */ - if (bio_end_sector(bio) == sector && - page_offset(bvec->bv_page) + bvec->bv_offset + - bvec->bv_len == page_offset(page) + pg_offset) - contig = true; - } else { + if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { /* - * For compression, all IO should have its logical bytenr - * set to the starting bytenr of the compressed extent. + * For compression, all IO should have its logical bytenr set + * to the starting bytenr of the compressed extent. */ - contig = bio->bi_iter.bi_sector == sector; + return bio->bi_iter.bi_sector == sector; } - if (!contig) - return 0; - - real_size = min(bio_ctrl->len_to_oe_boundary - bio_size, size); - /* - * If real_size is 0, never call bio_add_*_page(), as even size is 0, - * bio will still execute its endio function on the page! + * The contig check requires the following conditions to be met: + * + * 1) The pages are belonging to the same inode + * This is implied by the call chain. + * + * 2) The range has adjacent logical bytenr + * + * 3) The range has adjacent file offset + * This is required for the usage of btrfs_bio->file_offset. 
*/ - if (real_size == 0) - return 0; - - return bio_add_page(bio, page, real_size, pg_offset); + return bio_end_sector(bio) == sector && + page_offset(bvec->bv_page) + bvec->bv_offset + bvec->bv_len == + page_offset(page) + pg_offset; } -static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, - struct btrfs_inode *inode, u64 file_offset) +static void alloc_new_bio(struct btrfs_inode *inode, + struct btrfs_bio_ctrl *bio_ctrl, + u64 disk_bytenr, u64 file_offset) { - struct btrfs_ordered_extent *ordered; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_bio *bbio; + + bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info, + bio_ctrl->end_io_func, NULL); + bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + bbio->inode = inode; + bbio->file_offset = file_offset; + bio_ctrl->bbio = bbio; + bio_ctrl->len_to_oe_boundary = U32_MAX; /* * Limit the extent to the ordered boundary for Zone Append. @@ -957,132 +912,89 @@ static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, * them. */ if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE && - btrfs_use_zone_append(btrfs_bio(bio_ctrl->bio))) { + btrfs_use_zone_append(bbio)) { + struct btrfs_ordered_extent *ordered; + ordered = btrfs_lookup_ordered_extent(inode, file_offset); if (ordered) { bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, ordered->file_offset + ordered->disk_num_bytes - file_offset); btrfs_put_ordered_extent(ordered); - return; } } - bio_ctrl->len_to_oe_boundary = U32_MAX; -} - -static void alloc_new_bio(struct btrfs_inode *inode, - struct btrfs_bio_ctrl *bio_ctrl, - struct writeback_control *wbc, blk_opf_t opf, - u64 disk_bytenr, u32 offset, u64 file_offset, - enum btrfs_compression_type compress_type) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct bio *bio; - - bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, inode, bio_ctrl->end_io_func, - NULL); - /* - * For compressed page range, its disk_bytenr is always @disk_bytenr - * passed in, no matter if we have added any range into previous bio. - */ - if (compress_type != BTRFS_COMPRESS_NONE) - bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; - else - bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; - btrfs_bio(bio)->file_offset = file_offset; - bio_ctrl->bio = bio; - bio_ctrl->compress_type = compress_type; - calc_bio_boundaries(bio_ctrl, inode, file_offset); - - if (wbc) { + if (bio_ctrl->wbc) { /* * Pick the last added device to support cgroup writeback. For * multi-device file systems this means blk-cgroup policies have * to always be set on the last added/replaced device. * This is a bit odd but has been like that for a long time. */ - bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); - wbc_init_bio(wbc, bio); + bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); + wbc_init_bio(bio_ctrl->wbc, &bbio->bio); } } /* - * @opf: bio REQ_OP_* and REQ_* flags as one value - * @wbc: optional writeback control for io accounting * @disk_bytenr: logical bytenr where the write will be * @page: page to add to the bio * @size: portion of page that we want to write to * @pg_offset: offset of the new bio or to check whether we are adding * a contiguous page to the previous one - * @compress_type: compress type for current bio * - * The will either add the page into the existing @bio_ctrl->bio, or allocate a - * new one in @bio_ctrl->bio. + * The will either add the page into the existing @bio_ctrl->bbio, or allocate a + * new one in @bio_ctrl->bbio. 
* The mirror number for this IO should already be initizlied in * @bio_ctrl->mirror_num. */ -static int submit_extent_page(blk_opf_t opf, - struct writeback_control *wbc, - struct btrfs_bio_ctrl *bio_ctrl, - u64 disk_bytenr, struct page *page, - size_t size, unsigned long pg_offset, - enum btrfs_compression_type compress_type, - bool force_bio_submit) +static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, + u64 disk_bytenr, struct page *page, + size_t size, unsigned long pg_offset) { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - unsigned int cur = pg_offset; - - ASSERT(bio_ctrl); - - ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && - pg_offset + size <= PAGE_SIZE); + ASSERT(pg_offset + size <= PAGE_SIZE); ASSERT(bio_ctrl->end_io_func); - if (force_bio_submit) + if (bio_ctrl->bbio && + !btrfs_bio_is_contig(bio_ctrl, page, disk_bytenr, pg_offset)) submit_one_bio(bio_ctrl); - while (cur < pg_offset + size) { - u32 offset = cur - pg_offset; - int added; + do { + u32 len = size; /* Allocate new bio if needed */ - if (!bio_ctrl->bio) { - alloc_new_bio(inode, bio_ctrl, wbc, opf, disk_bytenr, - offset, page_offset(page) + cur, - compress_type); + if (!bio_ctrl->bbio) { + alloc_new_bio(inode, bio_ctrl, disk_bytenr, + page_offset(page) + pg_offset); } - /* - * We must go through btrfs_bio_add_page() to ensure each - * page range won't cross various boundaries. - */ - if (compress_type != BTRFS_COMPRESS_NONE) - added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, - size - offset, pg_offset + offset, - compress_type); - else - added = btrfs_bio_add_page(bio_ctrl, page, - disk_bytenr + offset, size - offset, - pg_offset + offset, compress_type); - - /* Metadata page range should never be split */ - if (!is_data_inode(&inode->vfs_inode)) - ASSERT(added == 0 || added == size - offset); - - /* At least we added some page, update the account */ - if (wbc && added) - wbc_account_cgroup_owner(wbc, page, added); - - /* We have reached boundary, submit right now */ - if (added < size - offset) { - /* The bio should contain some page(s) */ - ASSERT(bio_ctrl->bio->bi_iter.bi_size); + + /* Cap to the current ordered extent boundary if there is one. */ + if (len > bio_ctrl->len_to_oe_boundary) { + ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE); + ASSERT(is_data_inode(&inode->vfs_inode)); + len = bio_ctrl->len_to_oe_boundary; + } + + if (bio_add_page(&bio_ctrl->bbio->bio, page, len, pg_offset) != len) { + /* bio full: move on to a new one */ submit_one_bio(bio_ctrl); + continue; } - cur += added; - } - return 0; + + if (bio_ctrl->wbc) + wbc_account_cgroup_owner(bio_ctrl->wbc, page, len); + + size -= len; + pg_offset += len; + disk_bytenr += len; + bio_ctrl->len_to_oe_boundary -= len; + + /* Ordered extent boundary: move on to a new bio. 
*/ + if (bio_ctrl->len_to_oe_boundary == 0) + submit_one_bio(bio_ctrl); + } while (size); } static int attach_extent_buffer_page(struct extent_buffer *eb, @@ -1193,8 +1105,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * return 0 on success, otherwise return error */ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, - struct btrfs_bio_ctrl *bio_ctrl, - blk_opf_t read_flags, u64 *prev_em_start) + struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -1216,7 +1127,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, unlock_extent(tree, start, end, NULL); btrfs_page_set_error(fs_info, page, start, PAGE_SIZE); unlock_page(page); - goto out; + return ret; } if (page->index == last_byte >> PAGE_SHIFT) { @@ -1230,7 +1141,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, bio_ctrl->end_io_func = end_bio_extent_readpage; begin_page_read(fs_info, page); while (cur <= end) { - unsigned long this_bio_flag = 0; + enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; bool force_bio_submit = false; u64 disk_bytenr; @@ -1247,19 +1158,18 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, if (IS_ERR(em)) { unlock_extent(tree, cur, end, NULL); end_page_read(page, false, cur, end + 1 - cur); - ret = PTR_ERR(em); - break; + return PTR_ERR(em); } extent_offset = cur - em->start; BUG_ON(extent_map_end(em) <= cur); BUG_ON(end < cur); if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) - this_bio_flag = em->compress_type; + compress_type = em->compress_type; iosize = min(extent_map_end(em) - cur, end - cur + 1); iosize = ALIGN(iosize, blocksize); - if (this_bio_flag != BTRFS_COMPRESS_NONE) + if (compress_type != BTRFS_COMPRESS_NONE) disk_bytenr = em->block_start; else disk_bytenr = em->block_start + extent_offset; @@ -1331,24 +1241,20 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, continue; } - ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, - bio_ctrl, disk_bytenr, page, iosize, - pg_offset, this_bio_flag, - force_bio_submit); - if (ret) { - /* - * We have to unlock the remaining range, or the page - * will never be unlocked. - */ - unlock_extent(tree, cur, end, NULL); - end_page_read(page, false, cur, end + 1 - cur); - goto out; + if (bio_ctrl->compress_type != compress_type) { + submit_one_bio(bio_ctrl); + bio_ctrl->compress_type = compress_type; } + + if (force_bio_submit) + submit_one_bio(bio_ctrl); + submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, + pg_offset); cur = cur + iosize; pg_offset += iosize; } -out: - return ret; + + return 0; } int btrfs_read_folio(struct file *file, struct folio *folio) @@ -1357,12 +1263,12 @@ int btrfs_read_folio(struct file *file, struct folio *folio) struct btrfs_inode *inode = BTRFS_I(page->mapping->host); u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; - struct btrfs_bio_ctrl bio_ctrl = { 0 }; + struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; int ret; btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); + ret = btrfs_do_readpage(page, NULL, &bio_ctrl, NULL); /* * If btrfs_do_readpage() failed we will want to submit the assembled * bio to do the cleanup. 
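As a plain illustration of the reworked submit_extent_page() loop above, the stand-alone sketch below is not kernel code and not part of the patch: toy_bio_ctrl, TOY_BIO_CAPACITY and the toy_* helpers are invented stand-ins for btrfs_bio_ctrl, bio_add_page() and submit_one_bio(). It mimics the same chunking rule the patch introduces: cap each chunk at the remaining ordered-extent boundary, fall back to a fresh bio when the current one is full, and submit as soon as the boundary is consumed.

#include <stdint.h>
#include <stdio.h>

#define TOY_BIO_CAPACITY (256 * 1024)   /* arbitrary stand-in for a full bio */

struct toy_bio_ctrl {
        uint32_t bio_size;              /* bytes queued in the current toy bio */
        uint32_t len_to_oe_boundary;    /* bytes left to the ordered extent boundary */
};

static void toy_submit_one_bio(struct toy_bio_ctrl *ctrl)
{
        if (ctrl->bio_size)
                printf("submit bio of %u bytes\n", (unsigned int)ctrl->bio_size);
        ctrl->bio_size = 0;
        /* A fresh toy bio has no known boundary until one is looked up again. */
        ctrl->len_to_oe_boundary = UINT32_MAX;
}

/*
 * In the kernel, size never exceeds one page, so a freshly submitted bio can
 * always accept the retried chunk; the toy keeps chunks below
 * TOY_BIO_CAPACITY for the same reason.
 */
static void toy_submit_extent_page(struct toy_bio_ctrl *ctrl, uint32_t size)
{
        do {
                uint32_t len = size;

                /* Cap to the current ordered extent boundary if there is one. */
                if (len > ctrl->len_to_oe_boundary)
                        len = ctrl->len_to_oe_boundary;

                /* Toy bio full: move on to a new one and retry this chunk. */
                if (ctrl->bio_size + len > TOY_BIO_CAPACITY) {
                        toy_submit_one_bio(ctrl);
                        continue;
                }

                ctrl->bio_size += len;
                size -= len;
                ctrl->len_to_oe_boundary -= len;

                /* Ordered extent boundary: move on to a new bio. */
                if (ctrl->len_to_oe_boundary == 0)
                        toy_submit_one_bio(ctrl);
        } while (size);
}

int main(void)
{
        struct toy_bio_ctrl ctrl = { .bio_size = 0, .len_to_oe_boundary = 128 * 1024 };
        int i;

        for (i = 0; i < 4; i++)
                toy_submit_extent_page(&ctrl, 48 * 1024);
        toy_submit_one_bio(&ctrl);      /* final flush, like submit_one_bio() */
        return 0;
}

With a 128 KiB boundary and four 48 KiB chunks, the sketch submits one 128 KiB bio exactly at the boundary and leaves 64 KiB for the final flush, which mirrors how the patch moves the boundary capping out of btrfs_bio_add_page() and into the submit loop itself.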
@@ -1384,7 +1290,7 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, for (index = 0; index < nr_pages; index++) { btrfs_do_readpage(pages[index], em_cached, bio_ctrl, - REQ_RAHEAD, prev_em_start); + prev_em_start); put_page(pages[index]); } } @@ -1520,7 +1426,6 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, */ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, struct page *page, - struct writeback_control *wbc, struct btrfs_bio_ctrl *bio_ctrl, loff_t i_size, int *nr_ret) @@ -1531,18 +1436,14 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, u64 extent_offset; u64 block_start; struct extent_map *em; - int saved_ret = 0; int ret = 0; int nr = 0; - enum req_op op = REQ_OP_WRITE; - const blk_opf_t write_flags = wbc_to_write_flags(wbc); - bool has_error = false; bool compressed; ret = btrfs_writepage_cow_fixup(page); if (ret) { /* Fixup worker will requeue */ - redirty_page_for_writepage(wbc, page); + redirty_page_for_writepage(bio_ctrl->wbc, page); unlock_page(page); return 1; } @@ -1551,7 +1452,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * we don't want to touch the inode after unlocking the page, * so we update the mapping writeback index now */ - wbc->nr_to_write--; + bio_ctrl->wbc->nr_to_write--; bio_ctrl->end_io_func = end_bio_extent_writepage; while (cur <= end) { @@ -1587,10 +1488,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, if (IS_ERR(em)) { btrfs_page_set_error(fs_info, page, cur, end - cur + 1); ret = PTR_ERR_OR_ZERO(em); - has_error = true; - if (!saved_ret) - saved_ret = ret; - break; + goto out_error; } extent_offset = cur - em->start; @@ -1642,33 +1540,21 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, */ btrfs_page_clear_dirty(fs_info, page, cur, iosize); - ret = submit_extent_page(op | write_flags, wbc, - bio_ctrl, disk_bytenr, - page, iosize, - cur - page_offset(page), - 0, false); - if (ret) { - has_error = true; - if (!saved_ret) - saved_ret = ret; - - btrfs_page_set_error(fs_info, page, cur, iosize); - if (PageWriteback(page)) - btrfs_page_clear_writeback(fs_info, page, cur, - iosize); - } - + submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, + cur - page_offset(page)); cur += iosize; nr++; } + + btrfs_page_assert_not_dirty(fs_info, page); + *nr_ret = nr; + return 0; + +out_error: /* * If we finish without problem, we should not only clear page dirty, * but also empty subpage dirty bits */ - if (!has_error) - btrfs_page_assert_not_dirty(fs_info, page); - else - ret = saved_ret; *nr_ret = nr; return ret; } @@ -1682,8 +1568,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * Return 0 if everything goes well. * Return <0 for error. 
*/ -static int __extent_writepage(struct page *page, struct writeback_control *wbc, - struct btrfs_bio_ctrl *bio_ctrl) +static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) { struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; @@ -1696,7 +1581,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; - trace___extent_writepage(page, inode, wbc); + trace___extent_writepage(page, inode, bio_ctrl->wbc); WARN_ON(!PageLocked(page)); @@ -1721,15 +1606,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } if (!bio_ctrl->extent_locked) { - ret = writepage_delalloc(BTRFS_I(inode), page, wbc); + ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc); if (ret == 1) return 0; if (ret) goto done; } - ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, bio_ctrl, i_size, - &nr); + ret = __extent_writepage_io(BTRFS_I(inode), page, bio_ctrl, i_size, &nr); if (ret == 1) return 0; @@ -1773,6 +1657,8 @@ done: if (PageError(page)) end_extent_writepage(page, ret, page_start, page_end); if (bio_ctrl->extent_locked) { + struct writeback_control *wbc = bio_ctrl->wbc; + /* * If bio_ctrl->extent_locked, it's from extent_write_locked_range(), * the page can either be locked by lock_page() or @@ -1828,7 +1714,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { btrfs_tree_unlock(eb); - if (!bio_ctrl->sync_io) + if (bio_ctrl->wbc->sync_mode != WB_SYNC_ALL) return 0; if (!flush) { submit_write_bio(bio_ctrl, 0); @@ -2113,15 +1999,12 @@ static void prepare_eb_write(struct extent_buffer *eb) * Unlike the work in write_one_eb(), we rely completely on extent locking. * Page locking is only utilized at minimum to keep the VMM code happy. */ -static int write_one_subpage_eb(struct extent_buffer *eb, - struct writeback_control *wbc, - struct btrfs_bio_ctrl *bio_ctrl) +static void write_one_subpage_eb(struct extent_buffer *eb, + struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page = eb->pages[0]; - blk_opf_t write_flags = wbc_to_write_flags(wbc); bool no_dirty_ebs = false; - int ret; prepare_eb_write(eb); @@ -2137,36 +2020,22 @@ static int write_one_subpage_eb(struct extent_buffer *eb, bio_ctrl->end_io_func = end_bio_subpage_eb_writepage; - ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - bio_ctrl, eb->start, page, eb->len, - eb->start - page_offset(page), 0, false); - if (ret) { - btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); - set_btree_ioerr(page, eb); - unlock_page(page); - - if (atomic_dec_and_test(&eb->io_pages)) - end_extent_buffer_writeback(eb); - return -EIO; - } + submit_extent_page(bio_ctrl, eb->start, page, eb->len, + eb->start - page_offset(page)); unlock_page(page); /* * Submission finished without problem, if no range of the page is * dirty anymore, we have submitted a page. Update nr_written in wbc. 
*/ if (no_dirty_ebs) - wbc->nr_to_write--; - return ret; + bio_ctrl->wbc->nr_to_write--; } -static noinline_for_stack int write_one_eb(struct extent_buffer *eb, - struct writeback_control *wbc, +static noinline_for_stack void write_one_eb(struct extent_buffer *eb, struct btrfs_bio_ctrl *bio_ctrl) { u64 disk_bytenr = eb->start; int i, num_pages; - blk_opf_t write_flags = wbc_to_write_flags(wbc); - int ret = 0; prepare_eb_write(eb); @@ -2178,32 +2047,11 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); - ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - bio_ctrl, disk_bytenr, p, - PAGE_SIZE, 0, 0, false); - if (ret) { - set_btree_ioerr(p, eb); - if (PageWriteback(p)) - end_page_writeback(p); - if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) - end_extent_buffer_writeback(eb); - ret = -EIO; - break; - } + submit_extent_page(bio_ctrl, disk_bytenr, p, PAGE_SIZE, 0); disk_bytenr += PAGE_SIZE; - wbc->nr_to_write--; + bio_ctrl->wbc->nr_to_write--; unlock_page(p); } - - if (unlikely(ret)) { - for (; i < num_pages; i++) { - struct page *p = eb->pages[i]; - clear_page_dirty_for_io(p); - unlock_page(p); - } - } - - return ret; } /* @@ -2220,9 +2068,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, * Return >=0 for the number of submitted extent buffers. * Return <0 for fatal error. */ -static int submit_eb_subpage(struct page *page, - struct writeback_control *wbc, - struct btrfs_bio_ctrl *bio_ctrl) +static int submit_eb_subpage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); int submitted = 0; @@ -2284,10 +2130,8 @@ static int submit_eb_subpage(struct page *page, free_extent_buffer(eb); goto cleanup; } - ret = write_one_subpage_eb(eb, wbc, bio_ctrl); + write_one_subpage_eb(eb, bio_ctrl); free_extent_buffer(eb); - if (ret < 0) - goto cleanup; submitted++; } return submitted; @@ -2318,8 +2162,7 @@ cleanup: * previous call. * Return <0 for fatal error. */ -static int submit_eb_page(struct page *page, struct writeback_control *wbc, - struct btrfs_bio_ctrl *bio_ctrl, +static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl, struct extent_buffer **eb_context) { struct address_space *mapping = page->mapping; @@ -2331,7 +2174,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, return 0; if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return submit_eb_subpage(page, wbc, bio_ctrl); + return submit_eb_subpage(page, bio_ctrl); spin_lock(&mapping->private_lock); if (!PagePrivate(page)) { @@ -2364,7 +2207,8 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, * If for_sync, this hole will be filled with * trasnsaction commit. 
*/ - if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) + if (bio_ctrl->wbc->sync_mode == WB_SYNC_ALL && + !bio_ctrl->wbc->for_sync) ret = -EAGAIN; else ret = 0; @@ -2389,10 +2233,8 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, btrfs_schedule_zone_finish_bg(cache, eb); btrfs_put_block_group(cache); } - ret = write_one_eb(eb, wbc, bio_ctrl); + write_one_eb(eb, bio_ctrl); free_extent_buffer(eb); - if (ret < 0) - return ret; return 1; } @@ -2401,8 +2243,9 @@ int btree_write_cache_pages(struct address_space *mapping, { struct extent_buffer *eb_context = NULL; struct btrfs_bio_ctrl bio_ctrl = { + .wbc = wbc, + .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), .extent_locked = 0, - .sync_io = (wbc->sync_mode == WB_SYNC_ALL), }; struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; int ret = 0; @@ -2445,8 +2288,7 @@ retry: for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - ret = submit_eb_page(&folio->page, wbc, &bio_ctrl, - &eb_context); + ret = submit_eb_page(&folio->page, &bio_ctrl, &eb_context); if (ret == 0) continue; if (ret < 0) { @@ -2529,9 +2371,9 @@ retry: * existing IO to complete. */ static int extent_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, struct btrfs_bio_ctrl *bio_ctrl) { + struct writeback_control *wbc = bio_ctrl->wbc; struct inode *inode = mapping->host; int ret = 0; int done = 0; @@ -2632,7 +2474,7 @@ retry: continue; } - ret = __extent_writepage(&folio->page, wbc, bio_ctrl); + ret = __extent_writepage(&folio->page, bio_ctrl); if (ret < 0) { done = 1; break; @@ -2688,18 +2530,19 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) u64 cur = start; unsigned long nr_pages; const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize; - struct btrfs_bio_ctrl bio_ctrl = { - .extent_locked = 1, - .sync_io = 1, - }; struct writeback_control wbc_writepages = { .sync_mode = WB_SYNC_ALL, .range_start = start, .range_end = end + 1, - /* We're called from an async helper function */ - .punt_to_cgroup = 1, .no_cgroup_owner = 1, }; + struct btrfs_bio_ctrl bio_ctrl = { + .wbc = &wbc_writepages, + /* We're called from an async helper function */ + .opf = REQ_OP_WRITE | REQ_BTRFS_CGROUP_PUNT | + wbc_to_write_flags(&wbc_writepages), + .extent_locked = 1, + }; ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >> @@ -2719,7 +2562,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) ASSERT(PageLocked(page)); ASSERT(PageDirty(page)); clear_page_dirty_for_io(page); - ret = __extent_writepage(page, &wbc_writepages, &bio_ctrl); + ret = __extent_writepage(page, &bio_ctrl); ASSERT(ret <= 0); if (ret < 0) { found_error = true; @@ -2743,8 +2586,9 @@ int extent_writepages(struct address_space *mapping, struct inode *inode = mapping->host; int ret = 0; struct btrfs_bio_ctrl bio_ctrl = { + .wbc = wbc, + .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), .extent_locked = 0, - .sync_io = (wbc->sync_mode == WB_SYNC_ALL), }; /* @@ -2752,7 +2596,7 @@ int extent_writepages(struct address_space *mapping, * protect the write pointer updates. 
*/ btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); - ret = extent_write_cache_pages(mapping, wbc, &bio_ctrl); + ret = extent_write_cache_pages(mapping, &bio_ctrl); submit_write_bio(&bio_ctrl, ret); btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); return ret; @@ -2760,7 +2604,7 @@ int extent_writepages(struct address_space *mapping, void extent_readahead(struct readahead_control *rac) { - struct btrfs_bio_ctrl bio_ctrl = { 0 }; + struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; struct page *pagepool[16]; struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; @@ -4407,10 +4251,11 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, struct page *page = eb->pages[0]; struct extent_state *cached_state = NULL; struct btrfs_bio_ctrl bio_ctrl = { + .opf = REQ_OP_READ, .mirror_num = mirror_num, .parent_check = check, }; - int ret = 0; + int ret; ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); ASSERT(PagePrivate(page)); @@ -4428,14 +4273,13 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, return ret; } - ret = 0; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) || PageUptodate(page) || btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); - return ret; + return 0; } clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); @@ -4447,28 +4291,19 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); - ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl, - eb->start, page, eb->len, - eb->start - page_offset(page), 0, true); - if (ret) { - /* - * In the endio function, if we hit something wrong we will - * increase the io_pages, so here we need to decrease it for - * error path. - */ - atomic_dec(&eb->io_pages); - } + submit_extent_page(&bio_ctrl, eb->start, page, eb->len, + eb->start - page_offset(page)); submit_one_bio(&bio_ctrl); - if (ret || wait != WAIT_COMPLETE) { + if (wait != WAIT_COMPLETE) { free_extent_state(cached_state); - return ret; + return 0; } wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED, &cached_state); if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) - ret = -EIO; - return ret; + return -EIO; + return 0; } int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, @@ -4476,13 +4311,12 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, { int i; struct page *page; - int err; - int ret = 0; int locked_pages = 0; int all_uptodate = 1; int num_pages; unsigned long num_reads = 0; struct btrfs_bio_ctrl bio_ctrl = { + .opf = REQ_OP_READ, .mirror_num = mirror_num, .parent_check = check, }; @@ -4550,27 +4384,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, page = eb->pages[i]; if (!PageUptodate(page)) { - if (ret) { - atomic_dec(&eb->io_pages); - unlock_page(page); - continue; - } - ClearPageError(page); - err = submit_extent_page(REQ_OP_READ, NULL, - &bio_ctrl, page_offset(page), page, - PAGE_SIZE, 0, 0, false); - if (err) { - /* - * We failed to submit the bio so it's the - * caller's responsibility to perform cleanup - * i.e unlock page/set error bit. 
- */ - ret = err; - SetPageError(page); - unlock_page(page); - atomic_dec(&eb->io_pages); - } + submit_extent_page(&bio_ctrl, page_offset(page), page, + PAGE_SIZE, 0); } else { unlock_page(page); } @@ -4578,17 +4394,17 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, submit_one_bio(&bio_ctrl); - if (ret || wait != WAIT_COMPLETE) - return ret; + if (wait != WAIT_COMPLETE) + return 0; for (i = 0; i < num_pages; i++) { page = eb->pages[i]; wait_on_page_locked(page); if (!PageUptodate(page)) - ret = -EIO; + return -EIO; } - return ret; + return 0; unlock_exit: while (locked_pages > 0) { @@ -4596,7 +4412,7 @@ unlock_exit: page = eb->pages[locked_pages]; unlock_page(page); } - return ret; + return 0; } static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 41c77a100853..018c711a0bc8 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -336,48 +336,6 @@ out: } /* - * Locate the file_offset of @cur_disk_bytenr of a @bio. - * - * Bio of btrfs represents read range of - * [bi_sector << 9, bi_sector << 9 + bi_size). - * Knowing this, we can iterate through each bvec to locate the page belong to - * @cur_disk_bytenr and get the file offset. - * - * @inode is used to determine if the bvec page really belongs to @inode. - * - * Return 0 if we can't find the file offset - * Return >0 if we find the file offset and restore it to @file_offset_ret - */ -static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, - u64 disk_bytenr, u64 *file_offset_ret) -{ - struct bvec_iter iter; - struct bio_vec bvec; - u64 cur = bio->bi_iter.bi_sector << SECTOR_SHIFT; - int ret = 0; - - bio_for_each_segment(bvec, bio, iter) { - struct page *page = bvec.bv_page; - - if (cur > disk_bytenr) - break; - if (cur + bvec.bv_len <= disk_bytenr) { - cur += bvec.bv_len; - continue; - } - ASSERT(in_range(disk_bytenr, cur, bvec.bv_len)); - if (page->mapping && page->mapping->host && - page->mapping->host == inode) { - ret = 1; - *file_offset_ret = page_offset(page) + bvec.bv_offset + - disk_bytenr - cur; - break; - } - } - return ret; -} - -/* * Lookup the checksum for the read bio in csum tree. * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. @@ -386,17 +344,15 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_io_tree *io_tree = &inode->io_tree; struct bio *bio = &bbio->bio; struct btrfs_path *path; const u32 sectorsize = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; u32 orig_len = bio->bi_iter.bi_size; u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; - u64 cur_disk_bytenr; const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; - int count = 0; blk_status_t ret = BLK_STS_OK; + u32 bio_offset = 0; if ((inode->flags & BTRFS_INODE_NODATASUM) || test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) @@ -447,28 +403,14 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) path->skip_locking = 1; } - for (cur_disk_bytenr = orig_disk_bytenr; - cur_disk_bytenr < orig_disk_bytenr + orig_len; - cur_disk_bytenr += (count * sectorsize)) { - u64 search_len = orig_disk_bytenr + orig_len - cur_disk_bytenr; - unsigned int sector_offset; - u8 *csum_dst; - - /* - * Although both cur_disk_bytenr and orig_disk_bytenr is u64, - * we're calculating the offset to the bio start. 
- * - * Bio size is limited to UINT_MAX, thus unsigned int is large - * enough to contain the raw result, not to mention the right - * shifted result. - */ - ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX); - sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >> - fs_info->sectorsize_bits; - csum_dst = bbio->csum + sector_offset * csum_size; + while (bio_offset < orig_len) { + int count; + u64 cur_disk_bytenr = orig_disk_bytenr + bio_offset; + u8 *csum_dst = bbio->csum + + (bio_offset >> fs_info->sectorsize_bits) * csum_size; count = search_csum_tree(fs_info, path, cur_disk_bytenr, - search_len, csum_dst); + orig_len - bio_offset, csum_dst); if (count < 0) { ret = errno_to_blk_status(count); if (bbio->csum != bbio->csum_inline) @@ -493,14 +435,9 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) if (inode->root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { - u64 file_offset; - int ret; - - ret = search_file_offset_in_bio(bio, - &inode->vfs_inode, - cur_disk_bytenr, &file_offset); - if (ret) - set_extent_bits(io_tree, file_offset, + u64 file_offset = bbio->file_offset + bio_offset; + + set_extent_bits(&inode->io_tree, file_offset, file_offset + sectorsize - 1, EXTENT_NODATASUM); } else { @@ -509,6 +446,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) cur_disk_bytenr, cur_disk_bytenr + sectorsize); } } + bio_offset += count * sectorsize; } btrfs_free_path(path); @@ -659,7 +597,8 @@ fail: * in is large enough to contain all csums. */ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, - u8 *csum_buf, unsigned long *csum_bitmap) + u8 *csum_buf, unsigned long *csum_bitmap, + bool search_commit) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; @@ -676,6 +615,12 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, if (!path) return -ENOMEM; + if (search_commit) { + path->skip_locking = 1; + path->reada = READA_FORWARD; + path->search_commit_root = 1; + } + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; key.type = BTRFS_EXTENT_CSUM_KEY; key.offset = start; diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index cd7f2ae515c0..6be8725cd574 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -57,7 +57,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, bool nowait); int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, - u8 *csum_buf, unsigned long *csum_bitmap); + u8 *csum_buf, unsigned long *csum_bitmap, + bool search_commit); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 24cd49229408..0d98fc5f6f44 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -25,6 +25,18 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); /* + * Number of metadata items necessary for an unlink operation: + * + * 1 for the possible orphan item + * 1 for the dir item + * 1 for the dir index + * 1 for the inode ref + * 1 for the inode + * 1 for the parent inode + */ +#define BTRFS_UNLINK_METADATA_UNITS 6 + +/* * The reserved space at the beginning of each device. It covers the primary * super block and leaves space for potential use by other tools like * bootloaders or to lower potential damage of accidental overwrite. 
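To give the new BTRFS_UNLINK_METADATA_UNITS value above a concrete scale, here is a small stand-alone sketch; it is illustrative only and not part of the patch. The EXAMPLE_* names are invented, and the 16K nodesize and BTRFS_MAX_LEVEL of 8 are assumed typical values rather than anything this diff changes. It runs the six units through the same shape of calculation as btrfs_calc_insert_metadata_size() shown in the next hunk: nodesize * BTRFS_MAX_LEVEL * 2 per item.

#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_NODESIZE     (16 * 1024)  /* assumed default nodesize */
#define EXAMPLE_MAX_LEVEL    8            /* assumed BTRFS_MAX_LEVEL */
#define EXAMPLE_UNLINK_UNITS 6            /* mirrors BTRFS_UNLINK_METADATA_UNITS */

/*
 * Same shape as btrfs_calc_insert_metadata_size(): a full path of nodes,
 * doubled to allow for splits while cowing down the tree, per item.
 */
static uint64_t example_insert_metadata_size(unsigned int num_items)
{
        return (uint64_t)EXAMPLE_NODESIZE * EXAMPLE_MAX_LEVEL * 2 * num_items;
}

int main(void)
{
        /* 16K * 8 * 2 * 6 = 1572864 bytes, i.e. 1.5 MiB reserved per unlink. */
        printf("unlink reservation: %llu bytes\n",
               (unsigned long long)example_insert_metadata_size(EXAMPLE_UNLINK_UNITS));
        return 0;
}

This is a worst-case reservation, not expected usage, and it scales linearly with the nodesize actually in use.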
@@ -193,11 +205,7 @@ enum { #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL -#ifdef CONFIG_BTRFS_DEBUG -/* - * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG - */ -#define BTRFS_FEATURE_INCOMPAT_SUPP \ +#define BTRFS_FEATURE_INCOMPAT_SUPP_STABLE \ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ @@ -210,23 +218,22 @@ enum { BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ - BTRFS_FEATURE_INCOMPAT_ZONED | \ + BTRFS_FEATURE_INCOMPAT_ZONED) + +#ifdef CONFIG_BTRFS_DEBUG + /* + * Features under developmen like Extent tree v2 support is enabled + * only under CONFIG_BTRFS_DEBUG. + */ +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) + #else -#define BTRFS_FEATURE_INCOMPAT_SUPP \ - (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ - BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ - BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ - BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ - BTRFS_FEATURE_INCOMPAT_RAID56 | \ - BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ - BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ - BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ - BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ - BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ - BTRFS_FEATURE_INCOMPAT_ZONED) + +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE) + #endif #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ @@ -412,7 +419,6 @@ struct btrfs_fs_info { * Must be written and read while holding btrfs_fs_info::commit_root_sem. */ u64 last_reloc_trans; - u64 avg_delayed_ref_runtime; /* * This is updated to the current trans every time a full commit is @@ -638,7 +644,6 @@ struct btrfs_fs_info { refcount_t scrub_workers_refcnt; struct workqueue_struct *scrub_workers; struct workqueue_struct *scrub_wr_completion_workers; - struct workqueue_struct *scrub_parity_workers; struct btrfs_subpage_info *subpage_info; struct btrfs_discard_ctl discard_ctl; @@ -828,7 +833,7 @@ static inline u64 btrfs_csum_bytes_to_leaves( * Use this if we would be adding new items, as we could split nodes as we cow * down the tree. */ -static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, +static inline u64 btrfs_calc_insert_metadata_size(const struct btrfs_fs_info *fs_info, unsigned num_items) { return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; @@ -838,7 +843,7 @@ static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, * Doing a truncate or a modification won't result in new nodes or leaves, just * what we need for COW. 
*/ -static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, +static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info, unsigned num_items) { return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index b65c45b5d681..4c322b720a80 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -527,7 +527,7 @@ search_again: while (1) { u64 clear_start = 0, clear_len = 0, extent_start = 0; - bool should_throttle = false; + bool refill_delayed_refs_rsv = false; fi = NULL; leaf = path->nodes[0]; @@ -660,8 +660,7 @@ delete: /* No pending yet, add ourselves */ pending_del_slot = path->slots[0]; pending_del_nr = 1; - } else if (pending_del_nr && - path->slots[0] + 1 == pending_del_slot) { + } else if (path->slots[0] + 1 == pending_del_slot) { /* Hop on the pending chunk */ pending_del_nr++; pending_del_slot = path->slots[0]; @@ -686,10 +685,8 @@ delete: btrfs_abort_transaction(trans, ret); break; } - if (be_nice) { - if (btrfs_should_throttle_delayed_refs(trans)) - should_throttle = true; - } + if (be_nice && btrfs_check_space_for_delayed_refs(fs_info)) + refill_delayed_refs_rsv = true; } if (found_type == BTRFS_INODE_ITEM_KEY) @@ -697,7 +694,7 @@ delete: if (path->slots[0] == 0 || path->slots[0] != pending_del_slot || - should_throttle) { + refill_delayed_refs_rsv) { if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, @@ -720,7 +717,7 @@ delete: * actually allocate, so just bail if we're short and * let the normal reservation dance happen higher up. */ - if (should_throttle) { + if (refill_delayed_refs_rsv) { ret = btrfs_delayed_refs_rsv_refill(fs_info, BTRFS_RESERVE_NO_FLUSH); if (ret) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 957e4d76a7b6..57d070025c7a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -79,6 +79,7 @@ struct btrfs_iget_args { struct btrfs_dio_data { ssize_t submitted; struct extent_changeset *data_reserved; + struct btrfs_ordered_extent *ordered; bool data_space_reserved; bool nocow_done; }; @@ -669,8 +670,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) again: will_compress = 0; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; - nr_pages = min_t(unsigned long, nr_pages, - BTRFS_MAX_COMPRESSED / PAGE_SIZE); + nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES); /* * we don't want to send crud past the end of i_size through @@ -945,10 +945,9 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, ret = cow_file_range(inode, locked_page, start, end, &page_started, &nr_written, 0, NULL); /* Inline extent inserted, page gets unlocked and everything is done */ - if (page_started) { - ret = 0; - goto out; - } + if (page_started) + return 0; + if (ret < 0) { btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); if (locked_page) { @@ -962,14 +961,11 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, end_extent_writepage(locked_page, ret, page_start, page_end); unlock_page(locked_page); } - goto out; + return ret; } - ret = extent_write_locked_range(&inode->vfs_inode, start, end); /* All pages will be unlocked, including @locked_page */ -out: - kfree(async_extent); - return ret; + return extent_write_locked_range(&inode->vfs_inode, start, end); } static int submit_one_async_extent(struct btrfs_inode *inode, @@ -987,6 +983,9 @@ static int submit_one_async_extent(struct btrfs_inode *inode, u64 start = async_extent->start; u64 end = 
async_extent->start + async_extent->ram_size - 1; + if (async_chunk->blkcg_css) + kthread_associate_blkcg(async_chunk->blkcg_css); + /* * If async_chunk->locked_page is in the async_extent range, we need to * handle it. @@ -1001,8 +1000,10 @@ static int submit_one_async_extent(struct btrfs_inode *inode, lock_extent(io_tree, start, end, NULL); /* We have fall back to uncompressed write */ - if (!async_extent->pages) - return submit_uncompressed_range(inode, async_extent, locked_page); + if (!async_extent->pages) { + ret = submit_uncompressed_range(inode, async_extent, locked_page); + goto done; + } ret = btrfs_reserve_extent(root, async_extent->ram_size, async_extent->compressed_size, @@ -1054,24 +1055,18 @@ static int submit_one_async_extent(struct btrfs_inode *inode, extent_clear_unlock_delalloc(inode, start, end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_START_WRITEBACK); - if (btrfs_submit_compressed_write(inode, start, /* file_offset */ + + btrfs_submit_compressed_write(inode, start, /* file_offset */ async_extent->ram_size, /* num_bytes */ ins.objectid, /* disk_bytenr */ ins.offset, /* compressed_len */ async_extent->pages, /* compressed_pages */ async_extent->nr_pages, - async_chunk->write_flags, - async_chunk->blkcg_css, true)) { - const u64 start = async_extent->start; - const u64 end = start + async_extent->ram_size - 1; - - btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0); - - extent_clear_unlock_delalloc(inode, start, end, NULL, 0, - PAGE_END_WRITEBACK | PAGE_SET_ERROR); - free_async_extent_pages(async_extent); - } + async_chunk->write_flags, true); *alloc_hint = ins.objectid + ins.offset; +done: + if (async_chunk->blkcg_css) + kthread_associate_blkcg(NULL); kfree(async_extent); return ret; @@ -1086,8 +1081,7 @@ out_free: PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK | PAGE_SET_ERROR); free_async_extent_pages(async_extent); - kfree(async_extent); - return ret; + goto done; } /* @@ -1622,6 +1616,7 @@ static int cow_file_range_async(struct btrfs_inode *inode, if (blkcg_css != blkcg_root_css) { css_get(blkcg_css); async_chunk[i].blkcg_css = blkcg_css; + async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT; } else { async_chunk[i].blkcg_css = NULL; } @@ -2521,37 +2516,31 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, } /* - * Split an extent_map at [start, start + len] + * Split off the first pre bytes from the extent_map at [start, start + len] * * This function is intended to be used only for extract_ordered_extent(). 
*/ -static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, - u64 pre, u64 post) +static int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre) { struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; struct extent_map *split_pre = NULL; struct extent_map *split_mid = NULL; - struct extent_map *split_post = NULL; int ret = 0; unsigned long flags; - /* Sanity check */ - if (pre == 0 && post == 0) - return 0; + ASSERT(pre != 0); + ASSERT(pre < len); split_pre = alloc_extent_map(); - if (pre) - split_mid = alloc_extent_map(); - if (post) - split_post = alloc_extent_map(); - if (!split_pre || (pre && !split_mid) || (post && !split_post)) { + if (!split_pre) + return -ENOMEM; + split_mid = alloc_extent_map(); + if (!split_mid) { ret = -ENOMEM; - goto out; + goto out_free_pre; } - ASSERT(pre + post < len); - lock_extent(&inode->io_tree, start, start + len - 1, NULL); write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); @@ -2572,7 +2561,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, /* First, replace the em with a new extent_map starting from * em->start */ split_pre->start = em->start; - split_pre->len = (pre ? pre : em->len - post); + split_pre->len = pre; split_pre->orig_start = split_pre->start; split_pre->block_start = em->block_start; split_pre->block_len = split_pre->len; @@ -2586,38 +2575,21 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, /* * Now we only have an extent_map at: - * [em->start, em->start + pre] if pre != 0 - * [em->start, em->start + em->len - post] if pre == 0 - */ - - if (pre) { - /* Insert the middle extent_map */ - split_mid->start = em->start + pre; - split_mid->len = em->len - pre - post; - split_mid->orig_start = split_mid->start; - split_mid->block_start = em->block_start + pre; - split_mid->block_len = split_mid->len; - split_mid->orig_block_len = split_mid->block_len; - split_mid->ram_bytes = split_mid->len; - split_mid->flags = flags; - split_mid->compress_type = em->compress_type; - split_mid->generation = em->generation; - add_extent_mapping(em_tree, split_mid, 1); - } - - if (post) { - split_post->start = em->start + em->len - post; - split_post->len = post; - split_post->orig_start = split_post->start; - split_post->block_start = em->block_start + em->len - post; - split_post->block_len = split_post->len; - split_post->orig_block_len = split_post->block_len; - split_post->ram_bytes = split_post->len; - split_post->flags = flags; - split_post->compress_type = em->compress_type; - split_post->generation = em->generation; - add_extent_mapping(em_tree, split_post, 1); - } + * [em->start, em->start + pre] + */ + + /* Insert the middle extent_map. 
*/ + split_mid->start = em->start + pre; + split_mid->len = em->len - pre; + split_mid->orig_start = split_mid->start; + split_mid->block_start = em->block_start + pre; + split_mid->block_len = split_mid->len; + split_mid->orig_block_len = split_mid->block_len; + split_mid->ram_bytes = split_mid->len; + split_mid->flags = flags; + split_mid->compress_type = em->compress_type; + split_mid->generation = em->generation; + add_extent_mapping(em_tree, split_mid, 1); /* Once for us */ free_extent_map(em); @@ -2627,72 +2599,41 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, out_unlock: write_unlock(&em_tree->lock); unlock_extent(&inode->io_tree, start, start + len - 1, NULL); -out: - free_extent_map(split_pre); free_extent_map(split_mid); - free_extent_map(split_post); - +out_free_pre: + free_extent_map(split_pre); return ret; } -blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio) +int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, + struct btrfs_ordered_extent *ordered) { u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; u64 len = bbio->bio.bi_iter.bi_size; struct btrfs_inode *inode = bbio->inode; - struct btrfs_ordered_extent *ordered; - u64 file_len; - u64 end = start + len; - u64 ordered_end; - u64 pre, post; + u64 ordered_len = ordered->num_bytes; int ret = 0; - ordered = btrfs_lookup_ordered_extent(inode, bbio->file_offset); - if (WARN_ON_ONCE(!ordered)) - return BLK_STS_IOERR; + /* Must always be called for the beginning of an ordered extent. */ + if (WARN_ON_ONCE(start != ordered->disk_bytenr)) + return -EINVAL; - /* No need to split */ + /* No need to split if the ordered extent covers the entire bio. */ if (ordered->disk_num_bytes == len) - goto out; - - /* We cannot split once end_bio'd ordered extent */ - if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) { - ret = -EINVAL; - goto out; - } - - /* We cannot split a compressed ordered extent */ - if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) { - ret = -EINVAL; - goto out; - } - - ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes; - /* bio must be in one ordered extent */ - if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) { - ret = -EINVAL; - goto out; - } - - /* Checksum list should be empty */ - if (WARN_ON_ONCE(!list_empty(&ordered->list))) { - ret = -EINVAL; - goto out; - } - - file_len = ordered->num_bytes; - pre = start - ordered->disk_bytenr; - post = ordered_end - end; + return 0; - ret = btrfs_split_ordered_extent(ordered, pre, post); + ret = btrfs_split_ordered_extent(ordered, len); if (ret) - goto out; - ret = split_zoned_em(inode, bbio->file_offset, file_len, pre, post); + return ret; -out: - btrfs_put_ordered_extent(ordered); + /* + * Don't split the extent_map for NOCOW extents, as we're writing into + * a pre-existing one. + */ + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) + return 0; - return errno_to_blk_status(ret); + return split_extent_map(inode, bbio->file_offset, ordered_len, len); } /* @@ -3367,13 +3308,6 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, return 0; } -static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 offset) -{ - u64 offset_in_sectors = offset >> fs_info->sectorsize_bits; - - return csums + offset_in_sectors * fs_info->csum_size; -} - /* * Verify the checksum of a single data sector. 
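The hunk below open-codes what the removed btrfs_csum_ptr() helper used to do: index into the per-bio checksum array by sector. A worked example, assuming 4 KiB sectors (sectorsize_bits = 12) and crc32c checksums (csum_size = 4):

    bio_offset    = 8192
    sector index  = bio_offset >> sectorsize_bits = 8192 >> 12 = 2
    csum_expected = bbio->csum + 2 * 4, i.e. 8 bytes into the array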
* @@ -3411,7 +3345,8 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, return true; } - csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); + csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * + fs_info->csum_size; if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, csum_expected)) goto zeroit; @@ -3691,6 +3626,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + iput(inode); goto out; } btrfs_debug(fs_info, "auto deleting %Lu", @@ -3698,8 +3634,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) ret = btrfs_del_orphan_item(trans, root, found_key.objectid); btrfs_end_transaction(trans); - if (ret) + if (ret) { + iput(inode); goto out; + } continue; } @@ -4261,15 +4199,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir) { struct btrfs_root *root = dir->root; - /* - * 1 for the possible orphan item - * 1 for the dir item - * 1 for the dir index - * 1 for the inode ref - * 1 for the inode - * 1 for the parent inode - */ - return btrfs_start_transaction_fallback_global_rsv(root, 6); + return btrfs_start_transaction_fallback_global_rsv(root, + BTRFS_UNLINK_METADATA_UNITS); } static int btrfs_unlink(struct inode *dir, struct dentry *dentry) @@ -5243,7 +5174,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; - u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1); + u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1); int ret; /* @@ -5281,7 +5212,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, trans->block_rsv = &fs_info->trans_block_rsv; trans->bytes_reserved = delayed_refs_extra; btrfs_block_rsv_migrate(rsv, trans->block_rsv, - delayed_refs_extra, 1); + delayed_refs_extra, true); } return trans; } @@ -5291,7 +5222,7 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_block_rsv *rsv; + struct btrfs_block_rsv *rsv = NULL; int ret; trace_btrfs_inode_evict(inode); @@ -5308,18 +5239,18 @@ void btrfs_evict_inode(struct inode *inode) ((btrfs_root_refs(&root->root_item) != 0 && root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || btrfs_is_free_space_inode(BTRFS_I(inode)))) - goto no_delete; + goto out; if (is_bad_inode(inode)) - goto no_delete; + goto out; if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) - goto no_delete; + goto out; if (inode->i_nlink > 0) { BUG_ON(btrfs_root_refs(&root->root_item) != 0 && root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID); - goto no_delete; + goto out; } /* @@ -5328,7 +5259,7 @@ void btrfs_evict_inode(struct inode *inode) */ ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode)); if (ret) - goto no_delete; + goto out; /* * This drops any pending insert or delete operations we have for this @@ -5340,7 +5271,7 @@ void btrfs_evict_inode(struct inode *inode) rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) - goto no_delete; + goto out; rsv->size = btrfs_calc_metadata_size(fs_info, 1); rsv->failfast = true; @@ -5356,16 +5287,21 @@ void btrfs_evict_inode(struct inode *inode) trans = evict_refill_and_join(root, rsv); if (IS_ERR(trans)) - goto free_rsv; + goto out; trans->block_rsv = rsv; ret = 
btrfs_truncate_inode_items(trans, root, &control); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(fs_info); + /* + * We have not added new delayed items for our inode after we + * have flushed its delayed items, so no need to throttle on + * delayed items. However we have modified extent buffers. + */ + btrfs_btree_balance_dirty_nodelay(fs_info); if (ret && ret != -ENOSPC && ret != -EAGAIN) - goto free_rsv; + goto out; else if (!ret) break; } @@ -5387,9 +5323,8 @@ void btrfs_evict_inode(struct inode *inode) btrfs_end_transaction(trans); } -free_rsv: +out: btrfs_free_block_rsv(fs_info, rsv); -no_delete: /* * If we didn't successfully delete, the orphan item will still be in * the tree and we'll retry on the next mount. Again, we might also want @@ -6981,6 +6916,7 @@ out: } static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, + struct btrfs_dio_data *dio_data, const u64 start, const u64 len, const u64 orig_start, @@ -6991,7 +6927,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, const int type) { struct extent_map *em = NULL; - int ret; + struct btrfs_ordered_extent *ordered; if (type != BTRFS_ORDERED_NOCOW) { em = create_io_em(inode, start, len, orig_start, block_start, @@ -7001,18 +6937,21 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, if (IS_ERR(em)) goto out; } - ret = btrfs_add_ordered_extent(inode, start, len, len, block_start, - block_len, 0, - (1 << type) | - (1 << BTRFS_ORDERED_DIRECT), - BTRFS_COMPRESS_NONE); - if (ret) { + ordered = btrfs_alloc_ordered_extent(inode, start, len, len, + block_start, block_len, 0, + (1 << type) | + (1 << BTRFS_ORDERED_DIRECT), + BTRFS_COMPRESS_NONE); + if (IS_ERR(ordered)) { if (em) { free_extent_map(em); btrfs_drop_extent_map_range(inode, start, start + len - 1, false); } - em = ERR_PTR(ret); + em = ERR_CAST(ordered); + } else { + ASSERT(!dio_data->ordered); + dio_data->ordered = ordered; } out: @@ -7020,6 +6959,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, } static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, + struct btrfs_dio_data *dio_data, u64 start, u64 len) { struct btrfs_root *root = inode->root; @@ -7035,7 +6975,7 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, if (ret) return ERR_PTR(ret); - em = btrfs_create_dio_extent(inode, start, ins.offset, start, + em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start, ins.objectid, ins.offset, ins.offset, ins.offset, BTRFS_ORDERED_REGULAR); btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -7380,7 +7320,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, } space_reserved = true; - em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, + em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len, orig_start, block_start, len, orig_block_len, ram_bytes, type); @@ -7422,7 +7362,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, goto out; space_reserved = true; - em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); + em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -7728,6 +7668,10 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, pos + length - 1, NULL); ret = -ENOTBLK; } + if (write) { + btrfs_put_ordered_extent(dio_data->ordered); + dio_data->ordered = NULL; + } if (write) 
extent_changeset_free(dio_data->data_reserved); @@ -7767,14 +7711,34 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, container_of(bbio, struct btrfs_dio_private, bbio); struct btrfs_dio_data *dio_data = iter->private; - btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private); + btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info, + btrfs_dio_end_io, bio->bi_private); + bbio->inode = BTRFS_I(iter->inode); bbio->file_offset = file_offset; dip->file_offset = file_offset; dip->bytes = bio->bi_iter.bi_size; dio_data->submitted += bio->bi_iter.bi_size; - btrfs_submit_bio(bio, 0); + + /* + * Check if we are doing a partial write. If we are, we need to split + * the ordered extent to match the submitted bio. Hang on to the + * remaining unfinishable ordered_extent in dio_data so that it can be + * cancelled in iomap_end to avoid a deadlock wherein faulting the + * remaining pages is blocked on the outstanding ordered extent. + */ + if (iter->flags & IOMAP_WRITE) { + int ret; + + ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); + if (ret) { + btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); + return; + } + } + + btrfs_submit_bio(bbio, 0); } static const struct iomap_ops btrfs_dio_iomap_ops = { @@ -7789,7 +7753,7 @@ static const struct iomap_dio_ops btrfs_dio_ops = { ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) { - struct btrfs_dio_data data; + struct btrfs_dio_data data = { 0 }; return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, IOMAP_DIO_PARTIAL, &data, done_before); @@ -7798,7 +7762,7 @@ ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_be struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) { - struct btrfs_dio_data data; + struct btrfs_dio_data data = { 0 }; return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, IOMAP_DIO_PARTIAL, &data, done_before); @@ -9908,8 +9872,6 @@ out: } struct btrfs_encoded_read_private { - struct btrfs_inode *inode; - u64 file_offset; wait_queue_head_t wait; atomic_t pending; blk_status_t status; @@ -9939,45 +9901,41 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 disk_io_size, struct page **pages) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_encoded_read_private priv = { - .inode = inode, - .file_offset = file_offset, .pending = ATOMIC_INIT(1), }; unsigned long i = 0; - u64 cur = 0; + struct btrfs_bio *bbio; init_waitqueue_head(&priv.wait); - /* Submit bios for the extent, splitting due to bio limits as necessary. 
*/ - while (cur < disk_io_size) { - struct bio *bio = NULL; - u64 remaining = disk_io_size - cur; - - while (bio || remaining) { - size_t bytes = min_t(u64, remaining, PAGE_SIZE); - - if (!bio) { - bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, - inode, - btrfs_encoded_read_endio, - &priv); - bio->bi_iter.bi_sector = - (disk_bytenr + cur) >> SECTOR_SHIFT; - } - if (!bytes || - bio_add_page(bio, pages[i], bytes, 0) < bytes) { - atomic_inc(&priv.pending); - btrfs_submit_bio(bio, 0); - bio = NULL; - continue; - } + bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, + btrfs_encoded_read_endio, &priv); + bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + bbio->inode = inode; - i++; - cur += bytes; - remaining -= bytes; + do { + size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); + + if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { + atomic_inc(&priv.pending); + btrfs_submit_bio(bbio, 0); + + bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, + btrfs_encoded_read_endio, &priv); + bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + bbio->inode = inode; + continue; } - } + + i++; + disk_bytenr += bytes; + disk_io_size -= bytes; + } while (disk_io_size); + + atomic_inc(&priv.pending); + btrfs_submit_bio(bbio, 0); if (atomic_dec_return(&priv.pending)) io_wait_event(priv.wait, !atomic_read(&priv.pending)); @@ -10398,13 +10356,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, btrfs_delalloc_release_extents(inode, num_bytes); - if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid, - ins.offset, pages, nr_pages, 0, NULL, - false)) { - btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0); - ret = -EIO; - goto out_pages; - } + btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid, + ins.offset, pages, nr_pages, 0, false); ret = orig_count; goto out; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ba769a1eb87a..25833b4eeaf5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3161,6 +3161,11 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) if (IS_ERR(sa)) return PTR_ERR(sa); + if (sa->flags & ~BTRFS_SCRUB_SUPPORTED_FLAGS) { + ret = -EOPNOTSUPP; + goto out; + } + if (!(sa->flags & BTRFS_SCRUB_READONLY)) { ret = mnt_want_write_file(file); if (ret) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 870528d87526..3a496b0d3d2b 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -325,24 +325,12 @@ struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root) * acquire the lock. 
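The locking.c hunks that follow turn the drew-lock writer count from a percpu_counter into a plain atomic_t, so the write side can pair with the read side using smp_mb__after_atomic() rather than a full smp_mb(). A condensed sketch of that pairing, with both sides shown together (the reader-side increment sits outside the hunk context and is assumed here):

    /* write side (btrfs_drew_try_write_lock) */
    atomic_inc(&lock->writers);
    smp_mb__after_atomic();          /* order the inc before reading ->readers */
    if (atomic_read(&lock->readers))
            btrfs_drew_write_unlock(lock);   /* a reader got in, back off */

    /* read side (btrfs_drew_read_lock) */
    atomic_inc(&lock->readers);      /* assumed, not part of this hunk */
    smp_mb__after_atomic();
    wait_event(lock->pending_readers, atomic_read(&lock->writers) == 0);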
*/ -int btrfs_drew_lock_init(struct btrfs_drew_lock *lock) +void btrfs_drew_lock_init(struct btrfs_drew_lock *lock) { - int ret; - - ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL); - if (ret) - return ret; - atomic_set(&lock->readers, 0); + atomic_set(&lock->writers, 0); init_waitqueue_head(&lock->pending_readers); init_waitqueue_head(&lock->pending_writers); - - return 0; -} - -void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock) -{ - percpu_counter_destroy(&lock->writers); } /* Return true if acquisition is successful, false otherwise */ @@ -351,10 +339,10 @@ bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock) if (atomic_read(&lock->readers)) return false; - percpu_counter_inc(&lock->writers); + atomic_inc(&lock->writers); /* Ensure writers count is updated before we check for pending readers */ - smp_mb(); + smp_mb__after_atomic(); if (atomic_read(&lock->readers)) { btrfs_drew_write_unlock(lock); return false; @@ -374,7 +362,7 @@ void btrfs_drew_write_lock(struct btrfs_drew_lock *lock) void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock) { - percpu_counter_dec(&lock->writers); + atomic_dec(&lock->writers); cond_wake_up(&lock->pending_readers); } @@ -390,8 +378,7 @@ void btrfs_drew_read_lock(struct btrfs_drew_lock *lock) */ smp_mb__after_atomic(); - wait_event(lock->pending_readers, - percpu_counter_sum(&lock->writers) == 0); + wait_event(lock->pending_readers, atomic_read(&lock->writers) == 0); } void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock) diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 11c2269b4b6f..edb9b4a0dba1 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -195,13 +195,12 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) struct btrfs_drew_lock { atomic_t readers; - struct percpu_counter writers; + atomic_t writers; wait_queue_head_t pending_writers; wait_queue_head_t pending_readers; }; -int btrfs_drew_lock_init(struct btrfs_drew_lock *lock); -void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock); +void btrfs_drew_lock_init(struct btrfs_drew_lock *lock); void btrfs_drew_write_lock(struct btrfs_drew_lock *lock); bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock); void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock); diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h index de3e18bce24a..00328c856be6 100644 --- a/fs/btrfs/lru_cache.h +++ b/fs/btrfs/lru_cache.h @@ -55,11 +55,6 @@ static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *ca return cache->size; } -static inline bool btrfs_lru_cache_is_full(const struct btrfs_lru_cache *cache) -{ - return cache->size >= cache->max_size; -} - static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry( struct btrfs_lru_cache *cache) { diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 71f6d8302d50..3a095b9c6373 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -17,6 +17,7 @@ #include "compression.h" #include "ctree.h" #include "super.h" +#include "btrfs_inode.h" #define LZO_LEN 4 @@ -329,7 +330,7 @@ static void copy_compressed_segment(struct compressed_bio *cb, int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); - const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); + const struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; char *kaddr; int ret; @@ -388,8 +389,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio 
*cb) */ btrfs_err(fs_info, "unexpectedly large lzo segment len %u", seg_len); - ret = -EIO; - goto out; + return -EIO; } /* Copy the compressed segment payload into workspace */ @@ -400,8 +400,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) workspace->buf, &out_len); if (ret != LZO_E_OK) { btrfs_err(fs_info, "failed to decompress"); - ret = -EIO; - goto out; + return -EIO; } /* Copy the data into inode pages */ @@ -410,7 +409,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* All data read, exit */ if (ret == 0) - goto out; + return 0; ret = 0; /* Check if the sector has enough space for a segment header */ @@ -421,10 +420,8 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* Skip the padding zeros */ cur_in += sector_bytes_left; } -out: - if (!ret) - zero_fill_bio(cb->orig_bio); - return ret; + + return 0; } int lzo_decompress(struct list_head *ws, const u8 *data_in, diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index fde5aaa6e7c9..310a05cf95ef 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -253,7 +253,7 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, #endif #ifdef CONFIG_BTRFS_ASSERT -void __cold btrfs_assertfail(const char *expr, const char *file, int line) +void __cold __noreturn btrfs_assertfail(const char *expr, const char *file, int line) { pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); BUG(); diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 8c516ee58ff9..ac2d1982ba3d 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -160,7 +160,7 @@ do { \ } while (0) #ifdef CONFIG_BTRFS_ASSERT -void __cold btrfs_assertfail(const char *expr, const char *file, int line); +void __cold __noreturn btrfs_assertfail(const char *expr, const char *file, int line); #define ASSERT(expr) \ (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 6c24b69e2d0a..a9778a91511e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -160,14 +160,16 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, * @compress_type: Compression algorithm used for data. * * Most of these parameters correspond to &struct btrfs_file_extent_item. The - * tree is given a single reference on the ordered extent that was inserted. + * tree is given a single reference on the ordered extent that was inserted, and + * the returned pointer is given a second reference. * - * Return: 0 or -ENOMEM. + * Return: the new ordered extent or error pointer. 
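Because btrfs_alloc_ordered_extent() now hands back the entry with a second reference on top of the one owned by the tree, every caller must drop that reference when it is done. A minimal caller sketch, mirroring what the btrfs_add_ordered_extent() wrapper added further down does:

    struct btrfs_ordered_extent *ordered;

    ordered = btrfs_alloc_ordered_extent(inode, file_offset, num_bytes, ram_bytes,
                                          disk_bytenr, disk_num_bytes, offset,
                                          flags, compress_type);
    if (IS_ERR(ordered))
            return PTR_ERR(ordered);
    /* ... use it, e.g. stash it in dio_data->ordered as the iomap path does ... */
    btrfs_put_ordered_extent(ordered);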
*/ -int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, - u64 disk_num_bytes, u64 offset, unsigned flags, - int compress_type) +struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( + struct btrfs_inode *inode, u64 file_offset, + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, + u64 disk_num_bytes, u64 offset, unsigned long flags, + int compress_type) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -181,7 +183,7 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, /* For nocow write, we can release the qgroup rsv right now */ ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); if (ret < 0) - return ret; + return ERR_PTR(ret); ret = 0; } else { /* @@ -190,11 +192,11 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, */ ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes); if (ret < 0) - return ret; + return ERR_PTR(ret); } entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); if (!entry) - return -ENOMEM; + return ERR_PTR(-ENOMEM); entry->file_offset = file_offset; entry->num_bytes = num_bytes; @@ -256,6 +258,32 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, btrfs_mod_outstanding_extents(inode, 1); spin_unlock(&inode->lock); + /* One ref for the returned entry to match semantics of lookup. */ + refcount_inc(&entry->refs); + + return entry; +} + +/* + * Add a new btrfs_ordered_extent for the range, but drop the reference instead + * of returning it to the caller. + */ +int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, + u64 disk_num_bytes, u64 offset, unsigned long flags, + int compress_type) +{ + struct btrfs_ordered_extent *ordered; + + ordered = btrfs_alloc_ordered_extent(inode, file_offset, num_bytes, + ram_bytes, disk_bytenr, + disk_num_bytes, offset, flags, + compress_type); + + if (IS_ERR(ordered)) + return PTR_ERR(ordered); + btrfs_put_ordered_extent(ordered); + return 0; } @@ -1088,39 +1116,37 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, return false; } - -static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos, - u64 len) -{ - struct inode *inode = ordered->inode; - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - u64 file_offset = ordered->file_offset + pos; - u64 disk_bytenr = ordered->disk_bytenr + pos; - unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS; - - /* - * The splitting extent is already counted and will be added again in - * btrfs_add_ordered_extent_*(). Subtract len to avoid double counting. - */ - percpu_counter_add_batch(&fs_info->ordered_bytes, -len, - fs_info->delalloc_batch); - WARN_ON_ONCE(flags & (1 << BTRFS_ORDERED_COMPRESSED)); - return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len, - disk_bytenr, len, 0, flags, - ordered->compress_type); -} - -int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, - u64 post) +/* Split out a new ordered extent for this first @len bytes of @ordered. 
*/ +int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len) { struct inode *inode = ordered->inode; struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; - struct rb_node *node; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int ret = 0; + u64 file_offset = ordered->file_offset; + u64 disk_bytenr = ordered->disk_bytenr; + unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS; + struct rb_node *node; trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered); + ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED))); + + /* + * The entire bio must be covered by the ordered extent, but we can't + * reduce the original extent to a zero length either. + */ + if (WARN_ON_ONCE(len >= ordered->num_bytes)) + return -EINVAL; + /* We cannot split once ordered extent is past end_bio. */ + if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) + return -EINVAL; + /* We cannot split a compressed ordered extent. */ + if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) + return -EINVAL; + /* Checksum list should be empty. */ + if (WARN_ON_ONCE(!list_empty(&ordered->list))) + return -EINVAL; + spin_lock_irq(&tree->lock); /* Remove from tree once */ node = &ordered->rb_node; @@ -1129,11 +1155,11 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, if (tree->last == node) tree->last = NULL; - ordered->file_offset += pre; - ordered->disk_bytenr += pre; - ordered->num_bytes -= (pre + post); - ordered->disk_num_bytes -= (pre + post); - ordered->bytes_left -= (pre + post); + ordered->file_offset += len; + ordered->disk_bytenr += len; + ordered->num_bytes -= len; + ordered->disk_num_bytes -= len; + ordered->bytes_left -= len; /* Re-insert the node */ node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node); @@ -1144,13 +1170,15 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, spin_unlock_irq(&tree->lock); - if (pre) - ret = clone_ordered_extent(ordered, 0, pre); - if (ret == 0 && post) - ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes, - post); + /* + * The splitting extent is already counted and will be added again in + * btrfs_add_ordered_extent(). Subtract len to avoid double counting. 
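A worked example of the accounting below (sizes are illustrative): splitting the first 256K off a 1M ordered extent at file offset F and disk bytenr D. The ordered_bytes counter is decremented by 256K here only because btrfs_add_ordered_extent() adds those 256K right back when it creates the new front extent:

    original, before the split:  file_offset = F,        disk_bytenr = D,        num_bytes = 1M
    original, after the split:   file_offset = F + 256K, disk_bytenr = D + 256K, num_bytes = 768K
    new front ordered extent:    file_offset = F,        disk_bytenr = D,        num_bytes = 256K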
+ */ + percpu_counter_add_batch(&fs_info->ordered_bytes, -len, fs_info->delalloc_batch); - return ret; + return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len, + disk_bytenr, len, 0, flags, + ordered->compress_type); } int __init ordered_data_init(void) diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index eb40cb39f842..f0f1138d23c3 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -178,9 +178,14 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); +struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( + struct btrfs_inode *inode, u64 file_offset, + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, + u64 disk_num_bytes, u64 offset, unsigned long flags, + int compress_type); int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, - u64 disk_num_bytes, u64 offset, unsigned flags, + u64 disk_num_bytes, u64 offset, unsigned long flags, int compress_type); void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); @@ -207,8 +212,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, struct extent_state **cached_state); bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); -int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, - u64 post); +int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len); int __init ordered_data_init(void); void __cold ordered_data_exit(void); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 642828c1b299..2fab37f062de 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -202,7 +202,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) */ static int rbio_bucket(struct btrfs_raid_bio *rbio) { - u64 num = rbio->bioc->raid_map[0]; + u64 num = rbio->bioc->full_stripe_logical; /* * we shift down quite a bit. 
We're using byte @@ -407,16 +407,15 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) { struct btrfs_stripe_hash_table *table; - unsigned long flags; if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) return; table = rbio->bioc->fs_info->stripe_hash_table; - spin_lock_irqsave(&table->cache_lock, flags); + spin_lock(&table->cache_lock); __remove_rbio_from_cache(rbio); - spin_unlock_irqrestore(&table->cache_lock, flags); + spin_unlock(&table->cache_lock); } /* @@ -425,19 +424,18 @@ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) { struct btrfs_stripe_hash_table *table; - unsigned long flags; struct btrfs_raid_bio *rbio; table = info->stripe_hash_table; - spin_lock_irqsave(&table->cache_lock, flags); + spin_lock(&table->cache_lock); while (!list_empty(&table->stripe_cache)) { rbio = list_entry(table->stripe_cache.next, struct btrfs_raid_bio, stripe_cache); __remove_rbio_from_cache(rbio); } - spin_unlock_irqrestore(&table->cache_lock, flags); + spin_unlock(&table->cache_lock); } /* @@ -467,14 +465,13 @@ void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) static void cache_rbio(struct btrfs_raid_bio *rbio) { struct btrfs_stripe_hash_table *table; - unsigned long flags; if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) return; table = rbio->bioc->fs_info->stripe_hash_table; - spin_lock_irqsave(&table->cache_lock, flags); + spin_lock(&table->cache_lock); spin_lock(&rbio->bio_list_lock); /* bump our ref if we were not in the list before */ @@ -501,7 +498,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) __remove_rbio_from_cache(found); } - spin_unlock_irqrestore(&table->cache_lock, flags); + spin_unlock(&table->cache_lock); } /* @@ -530,15 +527,14 @@ static void run_xor(void **pages, int src_cnt, ssize_t len) */ static int rbio_is_full(struct btrfs_raid_bio *rbio) { - unsigned long flags; unsigned long size = rbio->bio_list_bytes; int ret = 1; - spin_lock_irqsave(&rbio->bio_list_lock, flags); + spin_lock(&rbio->bio_list_lock); if (size != rbio->nr_data * BTRFS_STRIPE_LEN) ret = 0; BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN); - spin_unlock_irqrestore(&rbio->bio_list_lock, flags); + spin_unlock(&rbio->bio_list_lock); return ret; } @@ -571,7 +567,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, test_bit(RBIO_CACHE_BIT, &cur->flags)) return 0; - if (last->bioc->raid_map[0] != cur->bioc->raid_map[0]) + if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical) return 0; /* we can't merge with different operations */ @@ -657,16 +653,15 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) struct btrfs_stripe_hash *h; struct btrfs_raid_bio *cur; struct btrfs_raid_bio *pending; - unsigned long flags; struct btrfs_raid_bio *freeit = NULL; struct btrfs_raid_bio *cache_drop = NULL; int ret = 0; h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio); - spin_lock_irqsave(&h->lock, flags); + spin_lock(&h->lock); list_for_each_entry(cur, &h->hash_list, hash_list) { - if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0]) + if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical) continue; spin_lock(&cur->bio_list_lock); @@ -724,7 +719,7 @@ lockit: refcount_inc(&rbio->refs); list_add(&rbio->hash_list, &h->hash_list); out: - spin_unlock_irqrestore(&h->lock, flags); + spin_unlock(&h->lock); if (cache_drop) remove_rbio_from_cache(cache_drop); if (freeit) @@ -742,7 +737,6 @@ 
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) { int bucket; struct btrfs_stripe_hash *h; - unsigned long flags; int keep_cache = 0; bucket = rbio_bucket(rbio); @@ -751,7 +745,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) if (list_empty(&rbio->plug_list)) cache_rbio(rbio); - spin_lock_irqsave(&h->lock, flags); + spin_lock(&h->lock); spin_lock(&rbio->bio_list_lock); if (!list_empty(&rbio->hash_list)) { @@ -788,7 +782,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) list_add(&next->hash_list, &h->hash_list); refcount_inc(&next->refs); spin_unlock(&rbio->bio_list_lock); - spin_unlock_irqrestore(&h->lock, flags); + spin_unlock(&h->lock); if (next->operation == BTRFS_RBIO_READ_REBUILD) start_async_work(next, recover_rbio_work_locked); @@ -808,7 +802,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) } done: spin_unlock(&rbio->bio_list_lock); - spin_unlock_irqrestore(&h->lock, flags); + spin_unlock(&h->lock); done_nolock: if (!keep_cache) @@ -891,16 +885,16 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, index = stripe_nr * rbio->stripe_nsectors + sector_nr; ASSERT(index >= 0 && index < rbio->nr_sectors); - spin_lock_irq(&rbio->bio_list_lock); + spin_lock(&rbio->bio_list_lock); sector = &rbio->bio_sectors[index]; if (sector->page || bio_list_only) { /* Don't return sector without a valid page pointer */ if (!sector->page) sector = NULL; - spin_unlock_irq(&rbio->bio_list_lock); + spin_unlock(&rbio->bio_list_lock); return sector; } - spin_unlock_irq(&rbio->bio_list_lock); + spin_unlock(&rbio->bio_list_lock); return &rbio->stripe_sectors[index]; } @@ -912,7 +906,7 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, struct btrfs_io_context *bioc) { - const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; + const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes; const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT; const unsigned int num_pages = stripe_npages * real_stripes; const unsigned int stripe_nsectors = @@ -1108,7 +1102,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, bio->bi_iter.bi_sector = disk_start >> 9; bio->bi_private = rbio; - bio_add_page(bio, sector->page, sectorsize, sector->pgoff); + __bio_add_page(bio, sector->page, sectorsize, sector->pgoff); bio_list_add(bio_list, bio); return 0; } @@ -1119,7 +1113,7 @@ static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) struct bio_vec bvec; struct bvec_iter iter; u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - - rbio->bioc->raid_map[0]; + rbio->bioc->full_stripe_logical; bio_for_each_segment(bvec, bio, iter) { u32 bvec_offset; @@ -1148,11 +1142,11 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) { struct bio *bio; - spin_lock_irq(&rbio->bio_list_lock); + spin_lock(&rbio->bio_list_lock); bio_list_for_each(bio, &rbio->bio_list) index_one_bio(rbio, bio); - spin_unlock_irq(&rbio->bio_list_lock); + spin_unlock(&rbio->bio_list_lock); } static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, @@ -1282,10 +1276,16 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, goto error; } - if (likely(!rbio->bioc->num_tgtdevs)) + if (likely(!rbio->bioc->replace_nr_stripes)) return 0; - /* Make a copy for the replace target device. */ + /* + * Make a copy for the replace target device. 
+ * + * Thus the source stripe number (in replace_stripe_src) should be valid. + */ + ASSERT(rbio->bioc->replace_stripe_src >= 0); + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { struct sector_ptr *sector; @@ -1293,7 +1293,12 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, stripe = total_sector_nr / rbio->stripe_nsectors; sectornr = total_sector_nr % rbio->stripe_nsectors; - if (!rbio->bioc->tgtdev_map[stripe]) { + /* + * For RAID56, there is only one device that can be replaced, + * and replace_stripe_src[0] indicates the stripe number we + * need to copy from. + */ + if (stripe != rbio->bioc->replace_stripe_src) { /* * We can skip the whole stripe completely, note * total_sector_nr will be increased by one anyway. @@ -1316,7 +1321,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, } ret = rbio_add_io_sector(rbio, bio_list, sector, - rbio->bioc->tgtdev_map[stripe], + rbio->real_stripes, sectornr, REQ_OP_WRITE); if (ret) goto error; @@ -1332,7 +1337,7 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - - rbio->bioc->raid_map[0]; + rbio->bioc->full_stripe_logical; int total_nr_sector = offset >> fs_info->sectorsize_bits; ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors); @@ -1609,7 +1614,7 @@ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) { const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; - const u64 full_stripe_start = rbio->bioc->raid_map[0]; + const u64 full_stripe_start = rbio->bioc->full_stripe_logical; const u32 orig_len = orig_bio->bi_iter.bi_size; const u32 sectorsize = fs_info->sectorsize; u64 cur_logical; @@ -1796,9 +1801,8 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, * here due to a crc mismatch and we can't give them the * data they want. */ - if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { - if (rbio->bioc->raid_map[faila] == - RAID5_P_STRIPE) + if (failb == rbio->real_stripes - 1) { + if (faila == rbio->real_stripes - 2) /* * Only P and Q are corrupted. 
* We only care about data stripes recovery, @@ -1812,7 +1816,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, goto pstripe; } - if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { + if (failb == rbio->real_stripes - 2) { raid6_datap_recov(rbio->real_stripes, sectorsize, faila, pointers); } else { @@ -1895,9 +1899,9 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) if (rbio->operation == BTRFS_RBIO_READ_REBUILD || rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { - spin_lock_irq(&rbio->bio_list_lock); + spin_lock(&rbio->bio_list_lock); set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); - spin_unlock_irq(&rbio->bio_list_lock); + spin_unlock(&rbio->bio_list_lock); } index_rbio_pages(rbio); @@ -2075,8 +2079,8 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; struct btrfs_root *csum_root = btrfs_csum_root(fs_info, - rbio->bioc->raid_map[0]); - const u64 start = rbio->bioc->raid_map[0]; + rbio->bioc->full_stripe_logical); + const u64 start = rbio->bioc->full_stripe_logical; const u32 len = (rbio->nr_data * rbio->stripe_nsectors) << fs_info->sectorsize_bits; int ret; @@ -2109,7 +2113,7 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio) } ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1, - rbio->csum_buf, rbio->csum_bitmap); + rbio->csum_buf, rbio->csum_bitmap, false); if (ret < 0) goto error; if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits)) @@ -2124,7 +2128,7 @@ error: */ btrfs_warn_rl(fs_info, "sub-stripe write for full stripe %llu is not safe, failed to get csum: %d", - rbio->bioc->raid_map[0], ret); + rbio->bioc->full_stripe_logical, ret); no_csum: kfree(rbio->csum_buf); bitmap_free(rbio->csum_bitmap); @@ -2265,9 +2269,9 @@ static void rmw_rbio(struct btrfs_raid_bio *rbio) * bio list any more, anyone else that wants to change this stripe * needs to do their own rmw. */ - spin_lock_irq(&rbio->bio_list_lock); + spin_lock(&rbio->bio_list_lock); set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); - spin_unlock_irq(&rbio->bio_list_lock); + spin_unlock(&rbio->bio_list_lock); bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); @@ -2372,23 +2376,6 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, return rbio; } -/* Used for both parity scrub and missing. */ -void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, - unsigned int pgoff, u64 logical) -{ - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - int stripe_offset; - int index; - - ASSERT(logical >= rbio->bioc->raid_map[0]); - ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] + - BTRFS_STRIPE_LEN * rbio->nr_data); - stripe_offset = (int)(logical - rbio->bioc->raid_map[0]); - index = stripe_offset / sectorsize; - rbio->bio_sectors[index].page = page; - rbio->bio_sectors[index].pgoff = pgoff; -} - /* * We just scrub the parity that we have correct data on the same horizontal, * so we needn't allocate all pages for all the stripes. @@ -2442,7 +2429,11 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) else BUG(); - if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) { + /* + * Replace is running and our P/Q stripe is being replaced, then we + * need to duplicate the final write to replace target. 
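With the old tgtdev_map[] gone, the duplicated write is addressed purely by stripe index: the replace target appears to be appended after the real stripes, so index rbio->real_stripes is used for it, and replace_stripe_src records which source stripe has to be copied. A small illustrative example:

    num_stripes        = 5   (4 data/parity stripes + 1 replace target)
    replace_nr_stripes = 1
    real_stripes       = 5 - 1 = 4
    the extra write goes to stripe index real_stripes = 4, the appended target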
+ */ + if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) { is_replace = 1; bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors); } @@ -2544,13 +2535,18 @@ writeback: if (!is_replace) goto submit_write; + /* + * Replace is running and our parity stripe needs to be duplicated to + * the target device. Check we have a valid source stripe number. + */ + ASSERT(rbio->bioc->replace_stripe_src >= 0); for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); ret = rbio_add_io_sector(rbio, &bio_list, sector, - bioc->tgtdev_map[rbio->scrubp], - sectornr, REQ_OP_WRITE); + rbio->real_stripes, + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2751,33 +2747,3 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) if (!lock_stripe_add(rbio)) start_async_work(rbio, scrub_rbio_work_locked); } - -/* The following code is used for dev replace of a missing RAID 5/6 device. */ - -struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) -{ - struct btrfs_fs_info *fs_info = bioc->fs_info; - struct btrfs_raid_bio *rbio; - - rbio = alloc_rbio(fs_info, bioc); - if (IS_ERR(rbio)) - return NULL; - - rbio->operation = BTRFS_RBIO_REBUILD_MISSING; - bio_list_add(&rbio->bio_list, bio); - /* - * This is a special bio which is used to hold the completion handler - * and make the scrub rbio is similar to the other types - */ - ASSERT(!bio->bi_iter.bi_size); - - set_rbio_range_error(rbio, bio); - - return rbio; -} - -void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) -{ - start_async_work(rbio, recover_rbio_work); -} diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index df0e0abdeb1f..0f7f31c8cb98 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -170,6 +170,11 @@ static inline int nr_data_stripes(const struct map_lookup *map) return map->num_stripes - btrfs_nr_parity_stripes(map->type); } +static inline int nr_bioc_data_stripes(const struct btrfs_io_context *bioc) +{ + return bioc->num_stripes - btrfs_nr_parity_stripes(bioc->map_type); +} + #define RAID5_P_STRIPE ((u64)-2) #define RAID6_Q_STRIPE ((u64)-1) @@ -182,19 +187,12 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, int mirror_num); void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc); -void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, - unsigned int pgoff, u64 logical); - struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, struct btrfs_io_context *bioc, struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); -struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc); -void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); - int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index ef13a9d4e370..09b1988d1791 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1266,7 +1266,7 @@ again: level = btrfs_header_level(parent); ASSERT(level >= lowest_level); - ret = btrfs_bin_search(parent, &key, &slot); + ret = btrfs_bin_search(parent, 0, &key, &slot); if (ret < 0) break; if (ret && slot > 0) @@ -2407,7 +2407,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (upper->eb && !upper->locked) { 
if (!lowest) { - ret = btrfs_bin_search(upper->eb, key, &slot); + ret = btrfs_bin_search(upper->eb, 0, key, &slot); if (ret < 0) goto next; BUG_ON(ret); @@ -2441,7 +2441,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, slot = path->slots[upper->level]; btrfs_release_path(path); } else { - ret = btrfs_bin_search(upper->eb, key, &slot); + ret = btrfs_bin_search(upper->eb, 0, key, &slot); if (ret < 0) goto next; BUG_ON(ret); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 69c93ae333f6..836725a19661 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -38,18 +38,14 @@ * - add a mode to also read unallocated space */ -struct scrub_block; struct scrub_ctx; /* - * The following three values only influence the performance. + * The following value only influences the performance. * - * The last one configures the number of parallel and outstanding I/O - * operations. The first one configures an upper limit for the number - * of (dynamically allocated) pages that are added to a bio. + * This determines the batch size for stripe submitted in one go. */ -#define SCRUB_SECTORS_PER_BIO 32 /* 128KiB per bio for 4KiB pages */ -#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for 4KiB pages */ +#define SCRUB_STRIPES_PER_SCTX 8 /* That would be 8 64K stripe per-device. */ /* * The following value times PAGE_SIZE needs to be large enough to match the @@ -57,128 +53,124 @@ struct scrub_ctx; */ #define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) -#define SCRUB_MAX_PAGES (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE)) +/* Represent one sector and its needed info to verify the content. */ +struct scrub_sector_verification { + bool is_metadata; -/* - * Maximum number of mirrors that can be available for all profiles counting - * the target device of dev-replace as one. During an active device replace - * procedure, the target device of the copy operation is a mirror for the - * filesystem data as well that can be used to read data in order to repair - * read errors on other disks. - * - * Current value is derived from RAID1C4 with 4 copies. - */ -#define BTRFS_MAX_MIRRORS (4 + 1) + union { + /* + * Csum pointer for data csum verification. Should point to a + * sector csum inside scrub_stripe::csums. + * + * NULL if this data sector has no csum. + */ + u8 *csum; -struct scrub_recover { - refcount_t refs; - struct btrfs_io_context *bioc; - u64 map_length; + /* + * Extra info for metadata verification. All sectors inside a + * tree block share the same generation. + */ + u64 generation; + }; }; -struct scrub_sector { - struct scrub_block *sblock; - struct list_head list; - u64 flags; /* extent flags */ - u64 generation; - /* Offset in bytes to @sblock. */ - u32 offset; - atomic_t refs; - unsigned int have_csum:1; - unsigned int io_error:1; - u8 csum[BTRFS_CSUM_SIZE]; - - struct scrub_recover *recover; -}; +enum scrub_stripe_flags { + /* Set when @mirror_num, @dev, @physical and @logical are set. */ + SCRUB_STRIPE_FLAG_INITIALIZED, -struct scrub_bio { - int index; - struct scrub_ctx *sctx; - struct btrfs_device *dev; - struct bio *bio; - blk_status_t status; - u64 logical; - u64 physical; - struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO]; - int sector_count; - int next_free; - struct work_struct work; -}; + /* Set when the read-repair is finished. */ + SCRUB_STRIPE_FLAG_REPAIR_DONE, -struct scrub_block { /* - * Each page will have its page::private used to record the logical - * bytenr. + * Set for data stripes if it's triggered from P/Q stripe. 
+ * During such scrub, we should not report errors in data stripes, nor + * update the accounting. */ - struct page *pages[SCRUB_MAX_PAGES]; - struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK]; - struct btrfs_device *dev; - /* Logical bytenr of the sblock */ - u64 logical; - u64 physical; - u64 physical_for_dev_replace; - /* Length of sblock in bytes */ - u32 len; - int sector_count; - int mirror_num; - - atomic_t outstanding_sectors; - refcount_t refs; /* free mem on transition to zero */ - struct scrub_ctx *sctx; - struct scrub_parity *sparity; - struct { - unsigned int header_error:1; - unsigned int checksum_error:1; - unsigned int no_io_error_seen:1; - unsigned int generation_error:1; /* also sets header_error */ - - /* The following is for the data used to check parity */ - /* It is for the data with checksum */ - unsigned int data_corrected:1; - }; - struct work_struct work; + SCRUB_STRIPE_FLAG_NO_REPORT, }; -/* Used for the chunks with parity stripe such RAID5/6 */ -struct scrub_parity { - struct scrub_ctx *sctx; +#define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE) - struct btrfs_device *scrub_dev; +/* + * Represent one contiguous range with a length of BTRFS_STRIPE_LEN. + */ +struct scrub_stripe { + struct scrub_ctx *sctx; + struct btrfs_block_group *bg; - u64 logic_start; + struct page *pages[SCRUB_STRIPE_PAGES]; + struct scrub_sector_verification *sectors; - u64 logic_end; + struct btrfs_device *dev; + u64 logical; + u64 physical; + + u16 mirror_num; + + /* Should be BTRFS_STRIPE_LEN / sectorsize. */ + u16 nr_sectors; + + /* + * How many data/meta extents are in this stripe. Only for scrub status + * reporting purposes. + */ + u16 nr_data_extents; + u16 nr_meta_extents; + + atomic_t pending_io; + wait_queue_head_t io_wait; + wait_queue_head_t repair_wait; - int nsectors; + /* + * Indicate the states of the stripe. Bits are defined in + * scrub_stripe_flags enum. + */ + unsigned long state; - u32 stripe_len; + /* Indicate which sectors are covered by extent items. */ + unsigned long extent_sector_bitmap; - refcount_t refs; + /* + * The errors hit during the initial read of the stripe. + * + * Would be utilized for error reporting and repair. + */ + unsigned long init_error_bitmap; - struct list_head sectors_list; + /* + * The following error bitmaps are all for the current status. + * Every time we submit a new read, these bitmaps may be updated. + * + * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap; + * + * IO and csum errors can happen for both metadata and data. + */ + unsigned long error_bitmap; + unsigned long io_error_bitmap; + unsigned long csum_error_bitmap; + unsigned long meta_error_bitmap; - /* Work of parity check and repair */ - struct work_struct work; + /* For writeback (repair or replace) error reporting. */ + unsigned long write_error_bitmap; - /* Mark the parity blocks which have data */ - unsigned long dbitmap; + /* Writeback can be concurrent, thus we need to protect the bitmap. */ + spinlock_t write_error_lock; /* - * Mark the parity blocks which have data, but errors happen when - * read data or check data + * Checksum for the whole stripe if this stripe is inside a data block + * group. 
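The csums buffer is sized in init_scrub_stripe() further down as (BTRFS_STRIPE_LEN >> sectorsize_bits) * csum_size. A worked example, assuming the usual 64K stripe length, 4 KiB sectors and crc32c checksums:

    nr_sectors = 64K / 4K = 16
    csums size = 16 * 4 bytes = 64 bytes per scrub_stripe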
*/ - unsigned long ebitmap; + u8 *csums; + + struct work_struct work; }; struct scrub_ctx { - struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX]; + struct scrub_stripe stripes[SCRUB_STRIPES_PER_SCTX]; + struct scrub_stripe *raid56_data_stripes; struct btrfs_fs_info *fs_info; int first_free; - int curr; - atomic_t bios_in_flight; - atomic_t workers_pending; - spinlock_t list_lock; - wait_queue_head_t list_wait; + int cur_stripe; struct list_head csum_list; atomic_t cancel_req; int readonly; @@ -191,10 +183,8 @@ struct scrub_ctx { int is_dev_replace; u64 write_pointer; - struct scrub_bio *wr_curr_bio; struct mutex wr_lock; struct btrfs_device *wr_tgtdev; - bool flush_all_writes; /* * statistics @@ -221,239 +211,66 @@ struct scrub_warning { struct btrfs_device *dev; }; -struct full_stripe_lock { - struct rb_node node; - u64 logical; - u64 refs; - struct mutex mutex; -}; - -#ifndef CONFIG_64BIT -/* This structure is for architectures whose (void *) is smaller than u64 */ -struct scrub_page_private { - u64 logical; -}; -#endif - -static int attach_scrub_page_private(struct page *page, u64 logical) -{ -#ifdef CONFIG_64BIT - attach_page_private(page, (void *)logical); - return 0; -#else - struct scrub_page_private *spp; - - spp = kmalloc(sizeof(*spp), GFP_KERNEL); - if (!spp) - return -ENOMEM; - spp->logical = logical; - attach_page_private(page, (void *)spp); - return 0; -#endif -} - -static void detach_scrub_page_private(struct page *page) -{ -#ifdef CONFIG_64BIT - detach_page_private(page); - return; -#else - struct scrub_page_private *spp; - - spp = detach_page_private(page); - kfree(spp); - return; -#endif -} - -static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx, - struct btrfs_device *dev, - u64 logical, u64 physical, - u64 physical_for_dev_replace, - int mirror_num) -{ - struct scrub_block *sblock; - - sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); - if (!sblock) - return NULL; - refcount_set(&sblock->refs, 1); - sblock->sctx = sctx; - sblock->logical = logical; - sblock->physical = physical; - sblock->physical_for_dev_replace = physical_for_dev_replace; - sblock->dev = dev; - sblock->mirror_num = mirror_num; - sblock->no_io_error_seen = 1; - /* - * Scrub_block::pages will be allocated at alloc_scrub_sector() when - * the corresponding page is not allocated. - */ - return sblock; -} - -/* - * Allocate a new scrub sector and attach it to @sblock. - * - * Will also allocate new pages for @sblock if needed. - */ -static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock, - u64 logical) +static void release_scrub_stripe(struct scrub_stripe *stripe) { - const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT; - struct scrub_sector *ssector; - - /* We must never have scrub_block exceed U32_MAX in size. 
*/ - ASSERT(logical - sblock->logical < U32_MAX); - - ssector = kzalloc(sizeof(*ssector), GFP_KERNEL); - if (!ssector) - return NULL; - - /* Allocate a new page if the slot is not allocated */ - if (!sblock->pages[page_index]) { - int ret; + if (!stripe) + return; - sblock->pages[page_index] = alloc_page(GFP_KERNEL); - if (!sblock->pages[page_index]) { - kfree(ssector); - return NULL; - } - ret = attach_scrub_page_private(sblock->pages[page_index], - sblock->logical + (page_index << PAGE_SHIFT)); - if (ret < 0) { - kfree(ssector); - __free_page(sblock->pages[page_index]); - sblock->pages[page_index] = NULL; - return NULL; - } + for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) { + if (stripe->pages[i]) + __free_page(stripe->pages[i]); + stripe->pages[i] = NULL; } - - atomic_set(&ssector->refs, 1); - ssector->sblock = sblock; - /* The sector to be added should not be used */ - ASSERT(sblock->sectors[sblock->sector_count] == NULL); - ssector->offset = logical - sblock->logical; - - /* The sector count must be smaller than the limit */ - ASSERT(sblock->sector_count < SCRUB_MAX_SECTORS_PER_BLOCK); - - sblock->sectors[sblock->sector_count] = ssector; - sblock->sector_count++; - sblock->len += sblock->sctx->fs_info->sectorsize; - - return ssector; -} - -static struct page *scrub_sector_get_page(struct scrub_sector *ssector) -{ - struct scrub_block *sblock = ssector->sblock; - pgoff_t index; - /* - * When calling this function, ssector must be alreaday attached to the - * parent sblock. - */ - ASSERT(sblock); - - /* The range should be inside the sblock range */ - ASSERT(ssector->offset < sblock->len); - - index = ssector->offset >> PAGE_SHIFT; - ASSERT(index < SCRUB_MAX_PAGES); - ASSERT(sblock->pages[index]); - ASSERT(PagePrivate(sblock->pages[index])); - return sblock->pages[index]; + kfree(stripe->sectors); + kfree(stripe->csums); + stripe->sectors = NULL; + stripe->csums = NULL; + stripe->sctx = NULL; + stripe->state = 0; } -static unsigned int scrub_sector_get_page_offset(struct scrub_sector *ssector) +static int init_scrub_stripe(struct btrfs_fs_info *fs_info, + struct scrub_stripe *stripe) { - struct scrub_block *sblock = ssector->sblock; + int ret; - /* - * When calling this function, ssector must be already attached to the - * parent sblock. 
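The init_scrub_stripe()/release_scrub_stripe() pair above uses the usual allocate-then-unwind shape: every failed allocation jumps to one error label, and the release helper is safe to call on a partially initialized stripe. A minimal userspace analogue of that pattern, with made-up names and sizes rather than the kernel structures:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DEMO_PAGES 4

struct demo_stripe {
        void *pages[DEMO_PAGES];
        void *sectors;
        void *csums;
};

/* Tolerates a partially initialized stripe, like release_scrub_stripe(). */
static void demo_release(struct demo_stripe *stripe)
{
        for (int i = 0; i < DEMO_PAGES; i++) {
                free(stripe->pages[i]);
                stripe->pages[i] = NULL;
        }
        free(stripe->sectors);
        free(stripe->csums);
        stripe->sectors = NULL;
        stripe->csums = NULL;
}

static int demo_init(struct demo_stripe *stripe, int nr_sectors, int csum_size)
{
        memset(stripe, 0, sizeof(*stripe));

        for (int i = 0; i < DEMO_PAGES; i++) {
                stripe->pages[i] = calloc(1, 4096);
                if (!stripe->pages[i])
                        goto error;
        }
        stripe->sectors = calloc(nr_sectors, 16);
        if (!stripe->sectors)
                goto error;
        stripe->csums = calloc(nr_sectors, csum_size);
        if (!stripe->csums)
                goto error;
        return 0;
error:
        demo_release(stripe);
        return -ENOMEM;
}

int main(void)
{
        struct demo_stripe stripe;
        int ret = demo_init(&stripe, 16, 32);

        printf("init returned %d\n", ret);
        if (ret == 0)
                demo_release(&stripe);
        return 0;
}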
- */ - ASSERT(sblock); + memset(stripe, 0, sizeof(*stripe)); - /* The range should be inside the sblock range */ - ASSERT(ssector->offset < sblock->len); + stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; + stripe->state = 0; - return offset_in_page(ssector->offset); -} + init_waitqueue_head(&stripe->io_wait); + init_waitqueue_head(&stripe->repair_wait); + atomic_set(&stripe->pending_io, 0); + spin_lock_init(&stripe->write_error_lock); -static char *scrub_sector_get_kaddr(struct scrub_sector *ssector) -{ - return page_address(scrub_sector_get_page(ssector)) + - scrub_sector_get_page_offset(ssector); + ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages); + if (ret < 0) + goto error; + + stripe->sectors = kcalloc(stripe->nr_sectors, + sizeof(struct scrub_sector_verification), + GFP_KERNEL); + if (!stripe->sectors) + goto error; + + stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits, + fs_info->csum_size, GFP_KERNEL); + if (!stripe->csums) + goto error; + return 0; +error: + release_scrub_stripe(stripe); + return -ENOMEM; } -static int bio_add_scrub_sector(struct bio *bio, struct scrub_sector *ssector, - unsigned int len) +static void wait_scrub_stripe_io(struct scrub_stripe *stripe) { - return bio_add_page(bio, scrub_sector_get_page(ssector), len, - scrub_sector_get_page_offset(ssector)); + wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0); } -static int scrub_setup_recheck_block(struct scrub_block *original_sblock, - struct scrub_block *sblocks_for_recheck[]); -static void scrub_recheck_block(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, - int retry_failed_mirror); -static void scrub_recheck_block_checksum(struct scrub_block *sblock); -static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good); -static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int sector_num, int force_write); -static void scrub_write_block_to_dev_replace(struct scrub_block *sblock); -static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, - int sector_num); -static int scrub_checksum_data(struct scrub_block *sblock); -static int scrub_checksum_tree_block(struct scrub_block *sblock); -static int scrub_checksum_super(struct scrub_block *sblock); -static void scrub_block_put(struct scrub_block *sblock); -static void scrub_sector_get(struct scrub_sector *sector); -static void scrub_sector_put(struct scrub_sector *sector); -static void scrub_parity_get(struct scrub_parity *sparity); -static void scrub_parity_put(struct scrub_parity *sparity); -static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, - u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num, u8 *csum, - u64 physical_for_dev_replace); -static void scrub_bio_end_io(struct bio *bio); -static void scrub_bio_end_io_worker(struct work_struct *work); -static void scrub_block_complete(struct scrub_block *sblock); -static void scrub_find_good_copy(struct btrfs_fs_info *fs_info, - u64 extent_logical, u32 extent_len, - u64 *extent_physical, - struct btrfs_device **extent_dev, - int *extent_mirror_num); -static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx, - struct scrub_sector *sector); -static void scrub_wr_submit(struct scrub_ctx *sctx); -static void scrub_wr_bio_end_io(struct bio *bio); -static void scrub_wr_bio_end_io_worker(struct work_struct *work); static void scrub_put_ctx(struct scrub_ctx *sctx); -static 
inline int scrub_is_page_on_raid56(struct scrub_sector *sector) -{ - return sector->recover && - (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); -} - -static void scrub_pending_bio_inc(struct scrub_ctx *sctx) -{ - refcount_inc(&sctx->refs); - atomic_inc(&sctx->bios_in_flight); -} - -static void scrub_pending_bio_dec(struct scrub_ctx *sctx) -{ - atomic_dec(&sctx->bios_in_flight); - wake_up(&sctx->list_wait); - scrub_put_ctx(sctx); -} - static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) { while (atomic_read(&fs_info->scrub_pause_req)) { @@ -486,223 +303,6 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) scrub_pause_off(fs_info); } -/* - * Insert new full stripe lock into full stripe locks tree - * - * Return pointer to existing or newly inserted full_stripe_lock structure if - * everything works well. - * Return ERR_PTR(-ENOMEM) if we failed to allocate memory - * - * NOTE: caller must hold full_stripe_locks_root->lock before calling this - * function - */ -static struct full_stripe_lock *insert_full_stripe_lock( - struct btrfs_full_stripe_locks_tree *locks_root, - u64 fstripe_logical) -{ - struct rb_node **p; - struct rb_node *parent = NULL; - struct full_stripe_lock *entry; - struct full_stripe_lock *ret; - - lockdep_assert_held(&locks_root->lock); - - p = &locks_root->root.rb_node; - while (*p) { - parent = *p; - entry = rb_entry(parent, struct full_stripe_lock, node); - if (fstripe_logical < entry->logical) { - p = &(*p)->rb_left; - } else if (fstripe_logical > entry->logical) { - p = &(*p)->rb_right; - } else { - entry->refs++; - return entry; - } - } - - /* - * Insert new lock. - */ - ret = kmalloc(sizeof(*ret), GFP_KERNEL); - if (!ret) - return ERR_PTR(-ENOMEM); - ret->logical = fstripe_logical; - ret->refs = 1; - mutex_init(&ret->mutex); - - rb_link_node(&ret->node, parent, p); - rb_insert_color(&ret->node, &locks_root->root); - return ret; -} - -/* - * Search for a full stripe lock of a block group - * - * Return pointer to existing full stripe lock if found - * Return NULL if not found - */ -static struct full_stripe_lock *search_full_stripe_lock( - struct btrfs_full_stripe_locks_tree *locks_root, - u64 fstripe_logical) -{ - struct rb_node *node; - struct full_stripe_lock *entry; - - lockdep_assert_held(&locks_root->lock); - - node = locks_root->root.rb_node; - while (node) { - entry = rb_entry(node, struct full_stripe_lock, node); - if (fstripe_logical < entry->logical) - node = node->rb_left; - else if (fstripe_logical > entry->logical) - node = node->rb_right; - else - return entry; - } - return NULL; -} - -/* - * Helper to get full stripe logical from a normal bytenr. - * - * Caller must ensure @cache is a RAID56 block group. - */ -static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr) -{ - u64 ret; - - /* - * Due to chunk item size limit, full stripe length should not be - * larger than U32_MAX. Just a sanity check here. - */ - WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX); - - /* - * round_down() can only handle power of 2, while RAID56 full - * stripe length can be 64KiB * n, so we need to manually round down. - */ - ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) * - cache->full_stripe_len + cache->start; - return ret; -} - -/* - * Lock a full stripe to avoid concurrency of recovery and read - * - * It's only used for profiles with parities (RAID5/6), for other profiles it - * does nothing. - * - * Return 0 if we locked full stripe covering @bytenr, with a mutex held. 
- * So caller must call unlock_full_stripe() at the same context. - * - * Return <0 if encounters error. - */ -static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, - bool *locked_ret) -{ - struct btrfs_block_group *bg_cache; - struct btrfs_full_stripe_locks_tree *locks_root; - struct full_stripe_lock *existing; - u64 fstripe_start; - int ret = 0; - - *locked_ret = false; - bg_cache = btrfs_lookup_block_group(fs_info, bytenr); - if (!bg_cache) { - ASSERT(0); - return -ENOENT; - } - - /* Profiles not based on parity don't need full stripe lock */ - if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) - goto out; - locks_root = &bg_cache->full_stripe_locks_root; - - fstripe_start = get_full_stripe_logical(bg_cache, bytenr); - - /* Now insert the full stripe lock */ - mutex_lock(&locks_root->lock); - existing = insert_full_stripe_lock(locks_root, fstripe_start); - mutex_unlock(&locks_root->lock); - if (IS_ERR(existing)) { - ret = PTR_ERR(existing); - goto out; - } - mutex_lock(&existing->mutex); - *locked_ret = true; -out: - btrfs_put_block_group(bg_cache); - return ret; -} - -/* - * Unlock a full stripe. - * - * NOTE: Caller must ensure it's the same context calling corresponding - * lock_full_stripe(). - * - * Return 0 if we unlock full stripe without problem. - * Return <0 for error - */ -static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, - bool locked) -{ - struct btrfs_block_group *bg_cache; - struct btrfs_full_stripe_locks_tree *locks_root; - struct full_stripe_lock *fstripe_lock; - u64 fstripe_start; - bool freeit = false; - int ret = 0; - - /* If we didn't acquire full stripe lock, no need to continue */ - if (!locked) - return 0; - - bg_cache = btrfs_lookup_block_group(fs_info, bytenr); - if (!bg_cache) { - ASSERT(0); - return -ENOENT; - } - if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) - goto out; - - locks_root = &bg_cache->full_stripe_locks_root; - fstripe_start = get_full_stripe_logical(bg_cache, bytenr); - - mutex_lock(&locks_root->lock); - fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start); - /* Unpaired unlock_full_stripe() detected */ - if (!fstripe_lock) { - WARN_ON(1); - ret = -ENOENT; - mutex_unlock(&locks_root->lock); - goto out; - } - - if (fstripe_lock->refs == 0) { - WARN_ON(1); - btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow", - fstripe_lock->logical); - } else { - fstripe_lock->refs--; - } - - if (fstripe_lock->refs == 0) { - rb_erase(&fstripe_lock->node, &locks_root->root); - freeit = true; - } - mutex_unlock(&locks_root->lock); - - mutex_unlock(&fstripe_lock->mutex); - if (freeit) - kfree(fstripe_lock); -out: - btrfs_put_block_group(bg_cache); - return ret; -} - static void scrub_free_csums(struct scrub_ctx *sctx) { while (!list_empty(&sctx->csum_list)) { @@ -721,24 +321,9 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) if (!sctx) return; - /* this can happen when scrub is cancelled */ - if (sctx->curr != -1) { - struct scrub_bio *sbio = sctx->bios[sctx->curr]; - - for (i = 0; i < sbio->sector_count; i++) - scrub_block_put(sbio->sectors[i]->sblock); - bio_put(sbio->bio); - } + for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) + release_scrub_stripe(&sctx->stripes[i]); - for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { - struct scrub_bio *sbio = sctx->bios[i]; - - if (!sbio) - break; - kfree(sbio); - } - - kfree(sctx->wr_curr_bio); scrub_free_csums(sctx); kfree(sctx); } @@ -760,45 +345,26 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( goto nomem; 
refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; - sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO; - sctx->curr = -1; sctx->fs_info = fs_info; INIT_LIST_HEAD(&sctx->csum_list); - for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { - struct scrub_bio *sbio; + for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) { + int ret; - sbio = kzalloc(sizeof(*sbio), GFP_KERNEL); - if (!sbio) + ret = init_scrub_stripe(fs_info, &sctx->stripes[i]); + if (ret < 0) goto nomem; - sctx->bios[i] = sbio; - - sbio->index = i; - sbio->sctx = sctx; - sbio->sector_count = 0; - INIT_WORK(&sbio->work, scrub_bio_end_io_worker); - - if (i != SCRUB_BIOS_PER_SCTX - 1) - sctx->bios[i]->next_free = i + 1; - else - sctx->bios[i]->next_free = -1; + sctx->stripes[i].sctx = sctx; } sctx->first_free = 0; - atomic_set(&sctx->bios_in_flight, 0); - atomic_set(&sctx->workers_pending, 0); atomic_set(&sctx->cancel_req, 0); - spin_lock_init(&sctx->list_lock); spin_lock_init(&sctx->stat_lock); - init_waitqueue_head(&sctx->list_wait); sctx->throttle_deadline = 0; - WARN_ON(sctx->wr_curr_bio != NULL); mutex_init(&sctx->wr_lock); - sctx->wr_curr_bio = NULL; if (is_dev_replace) { WARN_ON(!fs_info->dev_replace.tgtdev); sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; - sctx->flush_all_writes = false; } return sctx; @@ -898,10 +464,10 @@ err: return 0; } -static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) +static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev, + bool is_super, u64 logical, u64 physical) { - struct btrfs_device *dev; - struct btrfs_fs_info *fs_info; + struct btrfs_fs_info *fs_info = dev->fs_info; struct btrfs_path *path; struct btrfs_key found_key; struct extent_buffer *eb; @@ -914,22 +480,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) u8 ref_level = 0; int ret; - WARN_ON(sblock->sector_count < 1); - dev = sblock->dev; - fs_info = sblock->sctx->fs_info; - /* Super block error, no need to search extent tree. */ - if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { + if (is_super) { btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu", - errstr, btrfs_dev_name(dev), sblock->physical); + errstr, btrfs_dev_name(dev), physical); return; } path = btrfs_alloc_path(); if (!path) return; - swarn.physical = sblock->physical; - swarn.logical = sblock->logical; + swarn.physical = physical; + swarn.logical = logical; swarn.errstr = errstr; swarn.dev = NULL; @@ -978,447 +540,6 @@ out: btrfs_free_path(path); } -static inline void scrub_get_recover(struct scrub_recover *recover) -{ - refcount_inc(&recover->refs); -} - -static inline void scrub_put_recover(struct btrfs_fs_info *fs_info, - struct scrub_recover *recover) -{ - if (refcount_dec_and_test(&recover->refs)) { - btrfs_bio_counter_dec(fs_info); - btrfs_put_bioc(recover->bioc); - kfree(recover); - } -} - -/* - * scrub_handle_errored_block gets called when either verification of the - * sectors failed or the bio failed to read, e.g. with EIO. In the latter - * case, this function handles all sectors in the bio, even though only one - * may be bad. - * The goal of this function is to repair the errored block by using the - * contents of one of the mirrors. 
- */ -static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) -{ - struct scrub_ctx *sctx = sblock_to_check->sctx; - struct btrfs_device *dev = sblock_to_check->dev; - struct btrfs_fs_info *fs_info; - u64 logical; - unsigned int failed_mirror_index; - unsigned int is_metadata; - unsigned int have_csum; - /* One scrub_block for each mirror */ - struct scrub_block *sblocks_for_recheck[BTRFS_MAX_MIRRORS] = { 0 }; - struct scrub_block *sblock_bad; - int ret; - int mirror_index; - int sector_num; - int success; - bool full_stripe_locked; - unsigned int nofs_flag; - static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - - BUG_ON(sblock_to_check->sector_count < 1); - fs_info = sctx->fs_info; - if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { - /* - * If we find an error in a super block, we just report it. - * They will get written with the next transaction commit - * anyway - */ - scrub_print_warning("super block error", sblock_to_check); - spin_lock(&sctx->stat_lock); - ++sctx->stat.super_errors; - spin_unlock(&sctx->stat_lock); - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); - return 0; - } - logical = sblock_to_check->logical; - ASSERT(sblock_to_check->mirror_num); - failed_mirror_index = sblock_to_check->mirror_num - 1; - is_metadata = !(sblock_to_check->sectors[0]->flags & - BTRFS_EXTENT_FLAG_DATA); - have_csum = sblock_to_check->sectors[0]->have_csum; - - if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical)) - return 0; - - /* - * We must use GFP_NOFS because the scrub task might be waiting for a - * worker task executing this function and in turn a transaction commit - * might be waiting the scrub task to pause (which needs to wait for all - * the worker tasks to complete before pausing). - * We do allocations in the workers through insert_full_stripe_lock() - * and scrub_add_sector_to_wr_bio(), which happens down the call chain of - * this function. - */ - nofs_flag = memalloc_nofs_save(); - /* - * For RAID5/6, race can happen for a different device scrub thread. - * For data corruption, Parity and Data threads will both try - * to recovery the data. - * Race can lead to doubly added csum error, or even unrecoverable - * error. - */ - ret = lock_full_stripe(fs_info, logical, &full_stripe_locked); - if (ret < 0) { - memalloc_nofs_restore(nofs_flag); - spin_lock(&sctx->stat_lock); - if (ret == -ENOMEM) - sctx->stat.malloc_errors++; - sctx->stat.read_errors++; - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - return ret; - } - - /* - * read all mirrors one after the other. This includes to - * re-read the extent or metadata block that failed (that was - * the cause that this fixup code is called) another time, - * sector by sector this time in order to know which sectors - * caused I/O errors and which ones are good (for all mirrors). - * It is the goal to handle the situation when more than one - * mirror contains I/O errors, but the errors do not - * overlap, i.e. the data can be repaired by selecting the - * sectors from those mirrors without I/O error on the - * particular sectors. One example (with blocks >= 2 * sectorsize) - * would be that mirror #1 has an I/O error on the first sector, - * the second sector is good, and mirror #2 has an I/O error on - * the second sector, but the first sector is good. 
- * Then the first sector of the first mirror can be repaired by - * taking the first sector of the second mirror, and the - * second sector of the second mirror can be repaired by - * copying the contents of the 2nd sector of the 1st mirror. - * One more note: if the sectors of one mirror contain I/O - * errors, the checksum cannot be verified. In order to get - * the best data for repairing, the first attempt is to find - * a mirror without I/O errors and with a validated checksum. - * Only if this is not possible, the sectors are picked from - * mirrors with I/O errors without considering the checksum. - * If the latter is the case, at the end, the checksum of the - * repaired area is verified in order to correctly maintain - * the statistics. - */ - for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) { - /* - * Note: the two members refs and outstanding_sectors are not - * used in the blocks that are used for the recheck procedure. - * - * But alloc_scrub_block() will initialize sblock::ref anyway, - * so we can use scrub_block_put() to clean them up. - * - * And here we don't setup the physical/dev for the sblock yet, - * they will be correctly initialized in scrub_setup_recheck_block(). - */ - sblocks_for_recheck[mirror_index] = alloc_scrub_block(sctx, NULL, - logical, 0, 0, mirror_index); - if (!sblocks_for_recheck[mirror_index]) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - sctx->stat.read_errors++; - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); - goto out; - } - } - - /* Setup the context, map the logical blocks and alloc the sectors */ - ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck); - if (ret) { - spin_lock(&sctx->stat_lock); - sctx->stat.read_errors++; - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); - goto out; - } - BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); - sblock_bad = sblocks_for_recheck[failed_mirror_index]; - - /* build and submit the bios for the failed mirror, check checksums */ - scrub_recheck_block(fs_info, sblock_bad, 1); - - if (!sblock_bad->header_error && !sblock_bad->checksum_error && - sblock_bad->no_io_error_seen) { - /* - * The error disappeared after reading sector by sector, or - * the area was part of a huge bio and other parts of the - * bio caused I/O errors, or the block layer merged several - * read requests into one and the error is caused by a - * different bio (usually one of the two latter cases is - * the cause) - */ - spin_lock(&sctx->stat_lock); - sctx->stat.unverified_errors++; - sblock_to_check->data_corrected = 1; - spin_unlock(&sctx->stat_lock); - - if (sctx->is_dev_replace) - scrub_write_block_to_dev_replace(sblock_bad); - goto out; - } - - if (!sblock_bad->no_io_error_seen) { - spin_lock(&sctx->stat_lock); - sctx->stat.read_errors++; - spin_unlock(&sctx->stat_lock); - if (__ratelimit(&rs)) - scrub_print_warning("i/o error", sblock_to_check); - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); - } else if (sblock_bad->checksum_error) { - spin_lock(&sctx->stat_lock); - sctx->stat.csum_errors++; - spin_unlock(&sctx->stat_lock); - if (__ratelimit(&rs)) - scrub_print_warning("checksum error", sblock_to_check); - btrfs_dev_stat_inc_and_print(dev, - BTRFS_DEV_STAT_CORRUPTION_ERRS); - } else if (sblock_bad->header_error) { - spin_lock(&sctx->stat_lock); - sctx->stat.verify_errors++; - 
spin_unlock(&sctx->stat_lock); - if (__ratelimit(&rs)) - scrub_print_warning("checksum/header error", - sblock_to_check); - if (sblock_bad->generation_error) - btrfs_dev_stat_inc_and_print(dev, - BTRFS_DEV_STAT_GENERATION_ERRS); - else - btrfs_dev_stat_inc_and_print(dev, - BTRFS_DEV_STAT_CORRUPTION_ERRS); - } - - if (sctx->readonly) { - ASSERT(!sctx->is_dev_replace); - goto out; - } - - /* - * now build and submit the bios for the other mirrors, check - * checksums. - * First try to pick the mirror which is completely without I/O - * errors and also does not have a checksum error. - * If one is found, and if a checksum is present, the full block - * that is known to contain an error is rewritten. Afterwards - * the block is known to be corrected. - * If a mirror is found which is completely correct, and no - * checksum is present, only those sectors are rewritten that had - * an I/O error in the block to be repaired, since it cannot be - * determined, which copy of the other sectors is better (and it - * could happen otherwise that a correct sector would be - * overwritten by a bad one). - */ - for (mirror_index = 0; ;mirror_index++) { - struct scrub_block *sblock_other; - - if (mirror_index == failed_mirror_index) - continue; - - /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */ - if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) { - if (mirror_index >= BTRFS_MAX_MIRRORS) - break; - if (!sblocks_for_recheck[mirror_index]->sector_count) - break; - - sblock_other = sblocks_for_recheck[mirror_index]; - } else { - struct scrub_recover *r = sblock_bad->sectors[0]->recover; - int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs; - - if (mirror_index >= max_allowed) - break; - if (!sblocks_for_recheck[1]->sector_count) - break; - - ASSERT(failed_mirror_index == 0); - sblock_other = sblocks_for_recheck[1]; - sblock_other->mirror_num = 1 + mirror_index; - } - - /* build and submit the bios, check checksums */ - scrub_recheck_block(fs_info, sblock_other, 0); - - if (!sblock_other->header_error && - !sblock_other->checksum_error && - sblock_other->no_io_error_seen) { - if (sctx->is_dev_replace) { - scrub_write_block_to_dev_replace(sblock_other); - goto corrected_error; - } else { - ret = scrub_repair_block_from_good_copy( - sblock_bad, sblock_other); - if (!ret) - goto corrected_error; - } - } - } - - if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace) - goto did_not_correct_error; - - /* - * In case of I/O errors in the area that is supposed to be - * repaired, continue by picking good copies of those sectors. - * Select the good sectors from mirrors to rewrite bad sectors from - * the area to fix. Afterwards verify the checksum of the block - * that is supposed to be repaired. This verification step is - * only done for the purpose of statistic counting and for the - * final scrub report, whether errors remain. - * A perfect algorithm could make use of the checksum and try - * all possible combinations of sectors from the different mirrors - * until the checksum verification succeeds. For example, when - * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector - * of mirror #2 is readable but the final checksum test fails, - * then the 2nd sector of mirror #3 could be tried, whether now - * the final checksum succeeds. But this would be a rare - * exception and is therefore not implemented. At least it is - * avoided that the good copy is overwritten. 
- * A more useful improvement would be to pick the sectors - * without I/O error based on sector sizes (512 bytes on legacy - * disks) instead of on sectorsize. Then maybe 512 byte of one - * mirror could be repaired by taking 512 byte of a different - * mirror, even if other 512 byte sectors in the same sectorsize - * area are unreadable. - */ - success = 1; - for (sector_num = 0; sector_num < sblock_bad->sector_count; - sector_num++) { - struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num]; - struct scrub_block *sblock_other = NULL; - - /* Skip no-io-error sectors in scrub */ - if (!sector_bad->io_error && !sctx->is_dev_replace) - continue; - - if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) { - /* - * In case of dev replace, if raid56 rebuild process - * didn't work out correct data, then copy the content - * in sblock_bad to make sure target device is identical - * to source device, instead of writing garbage data in - * sblock_for_recheck array to target device. - */ - sblock_other = NULL; - } else if (sector_bad->io_error) { - /* Try to find no-io-error sector in mirrors */ - for (mirror_index = 0; - mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index]->sector_count > 0; - mirror_index++) { - if (!sblocks_for_recheck[mirror_index]-> - sectors[sector_num]->io_error) { - sblock_other = sblocks_for_recheck[mirror_index]; - break; - } - } - if (!sblock_other) - success = 0; - } - - if (sctx->is_dev_replace) { - /* - * Did not find a mirror to fetch the sector from. - * scrub_write_sector_to_dev_replace() handles this - * case (sector->io_error), by filling the block with - * zeros before submitting the write request - */ - if (!sblock_other) - sblock_other = sblock_bad; - - if (scrub_write_sector_to_dev_replace(sblock_other, - sector_num) != 0) { - atomic64_inc( - &fs_info->dev_replace.num_write_errors); - success = 0; - } - } else if (sblock_other) { - ret = scrub_repair_sector_from_good_copy(sblock_bad, - sblock_other, - sector_num, 0); - if (0 == ret) - sector_bad->io_error = 0; - else - success = 0; - } - } - - if (success && !sctx->is_dev_replace) { - if (is_metadata || have_csum) { - /* - * need to verify the checksum now that all - * sectors on disk are repaired (the write - * request for data to be repaired is on its way). - * Just be lazy and use scrub_recheck_block() - * which re-reads the data before the checksum - * is verified, but most likely the data comes out - * of the page cache. 
- */ - scrub_recheck_block(fs_info, sblock_bad, 1); - if (!sblock_bad->header_error && - !sblock_bad->checksum_error && - sblock_bad->no_io_error_seen) - goto corrected_error; - else - goto did_not_correct_error; - } else { -corrected_error: - spin_lock(&sctx->stat_lock); - sctx->stat.corrected_errors++; - sblock_to_check->data_corrected = 1; - spin_unlock(&sctx->stat_lock); - btrfs_err_rl_in_rcu(fs_info, - "fixed up error at logical %llu on dev %s", - logical, btrfs_dev_name(dev)); - } - } else { -did_not_correct_error: - spin_lock(&sctx->stat_lock); - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - btrfs_err_rl_in_rcu(fs_info, - "unable to fixup (regular) error at logical %llu on dev %s", - logical, btrfs_dev_name(dev)); - } - -out: - for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) { - struct scrub_block *sblock = sblocks_for_recheck[mirror_index]; - struct scrub_recover *recover; - int sector_index; - - /* Not allocated, continue checking the next mirror */ - if (!sblock) - continue; - - for (sector_index = 0; sector_index < sblock->sector_count; - sector_index++) { - /* - * Here we just cleanup the recover, each sector will be - * properly cleaned up by later scrub_block_put() - */ - recover = sblock->sectors[sector_index]->recover; - if (recover) { - scrub_put_recover(fs_info, recover); - sblock->sectors[sector_index]->recover = NULL; - } - } - scrub_block_put(sblock); - } - - ret = unlock_full_stripe(fs_info, logical, full_stripe_locked); - memalloc_nofs_restore(nofs_flag); - if (ret < 0) - return ret; - return 0; -} - static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc) { if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) @@ -1430,7 +551,7 @@ static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc) } static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, - u64 *raid_map, + u64 full_stripe_logical, int nstripes, int mirror, int *stripe_index, u64 *stripe_offset) @@ -1438,19 +559,22 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, int i; if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + const int nr_data_stripes = (map_type & BTRFS_BLOCK_GROUP_RAID5) ? 
+ nstripes - 1 : nstripes - 2; + /* RAID5/6 */ - for (i = 0; i < nstripes; i++) { - if (raid_map[i] == RAID6_Q_STRIPE || - raid_map[i] == RAID5_P_STRIPE) - continue; + for (i = 0; i < nr_data_stripes; i++) { + const u64 data_stripe_start = full_stripe_logical + + (i * BTRFS_STRIPE_LEN); - if (logical >= raid_map[i] && - logical < raid_map[i] + BTRFS_STRIPE_LEN) + if (logical >= data_stripe_start && + logical < data_stripe_start + BTRFS_STRIPE_LEN) break; } *stripe_index = i; - *stripe_offset = logical - raid_map[i]; + *stripe_offset = (logical - full_stripe_logical) & + BTRFS_STRIPE_LEN_MASK; } else { /* The other RAID type */ *stripe_index = mirror; @@ -1458,336 +582,6 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, } } -static int scrub_setup_recheck_block(struct scrub_block *original_sblock, - struct scrub_block *sblocks_for_recheck[]) -{ - struct scrub_ctx *sctx = original_sblock->sctx; - struct btrfs_fs_info *fs_info = sctx->fs_info; - u64 logical = original_sblock->logical; - u64 length = original_sblock->sector_count << fs_info->sectorsize_bits; - u64 generation = original_sblock->sectors[0]->generation; - u64 flags = original_sblock->sectors[0]->flags; - u64 have_csum = original_sblock->sectors[0]->have_csum; - struct scrub_recover *recover; - struct btrfs_io_context *bioc; - u64 sublen; - u64 mapped_length; - u64 stripe_offset; - int stripe_index; - int sector_index = 0; - int mirror_index; - int nmirrors; - int ret; - - while (length > 0) { - sublen = min_t(u64, length, fs_info->sectorsize); - mapped_length = sublen; - bioc = NULL; - - /* - * With a length of sectorsize, each returned stripe represents - * one mirror - */ - btrfs_bio_counter_inc_blocked(fs_info); - ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &mapped_length, &bioc); - if (ret || !bioc || mapped_length < sublen) { - btrfs_put_bioc(bioc); - btrfs_bio_counter_dec(fs_info); - return -EIO; - } - - recover = kzalloc(sizeof(struct scrub_recover), GFP_KERNEL); - if (!recover) { - btrfs_put_bioc(bioc); - btrfs_bio_counter_dec(fs_info); - return -ENOMEM; - } - - refcount_set(&recover->refs, 1); - recover->bioc = bioc; - recover->map_length = mapped_length; - - ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK); - - nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS); - - for (mirror_index = 0; mirror_index < nmirrors; - mirror_index++) { - struct scrub_block *sblock; - struct scrub_sector *sector; - - sblock = sblocks_for_recheck[mirror_index]; - sblock->sctx = sctx; - - sector = alloc_scrub_sector(sblock, logical); - if (!sector) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); - scrub_put_recover(fs_info, recover); - return -ENOMEM; - } - sector->flags = flags; - sector->generation = generation; - sector->have_csum = have_csum; - if (have_csum) - memcpy(sector->csum, - original_sblock->sectors[0]->csum, - sctx->fs_info->csum_size); - - scrub_stripe_index_and_offset(logical, - bioc->map_type, - bioc->raid_map, - bioc->num_stripes - - bioc->num_tgtdevs, - mirror_index, - &stripe_index, - &stripe_offset); - /* - * We're at the first sector, also populate @sblock - * physical and dev. 
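To make the new RAID5/6 arithmetic in scrub_stripe_index_and_offset() concrete: data stripes are laid out back to back from full_stripe_logical in BTRFS_STRIPE_LEN (64KiB) slots (nstripes - 1 of them for RAID5, nstripes - 2 for RAID6), so the stripe index is which slot the logical address falls into and the offset is the low bits, which is what the BTRFS_STRIPE_LEN_MASK expression yields. The kernel finds the slot with a short loop; the division below is the equivalent closed form, checked with made-up addresses:

#include <stdio.h>
#include <stdint.h>

#define STRIPE_LEN      (64 * 1024ULL)
#define STRIPE_LEN_MASK (STRIPE_LEN - 1)

int main(void)
{
        /* Addresses are invented for the example. */
        uint64_t full_stripe_logical = 1024 * 1024;
        uint64_t logical = full_stripe_logical + STRIPE_LEN + 4096;

        int stripe_index = (int)((logical - full_stripe_logical) / STRIPE_LEN);
        uint64_t stripe_offset = (logical - full_stripe_logical) & STRIPE_LEN_MASK;

        /* Expect index 1 (the second data stripe) and offset 4096. */
        printf("index=%d offset=%llu\n", stripe_index,
               (unsigned long long)stripe_offset);
        return 0;
}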
- */ - if (sector_index == 0) { - sblock->physical = - bioc->stripes[stripe_index].physical + - stripe_offset; - sblock->dev = bioc->stripes[stripe_index].dev; - sblock->physical_for_dev_replace = - original_sblock->physical_for_dev_replace; - } - - BUG_ON(sector_index >= original_sblock->sector_count); - scrub_get_recover(recover); - sector->recover = recover; - } - scrub_put_recover(fs_info, recover); - length -= sublen; - logical += sublen; - sector_index++; - } - - return 0; -} - -static void scrub_bio_wait_endio(struct bio *bio) -{ - complete(bio->bi_private); -} - -static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, - struct bio *bio, - struct scrub_sector *sector) -{ - DECLARE_COMPLETION_ONSTACK(done); - - bio->bi_iter.bi_sector = (sector->offset + sector->sblock->logical) >> - SECTOR_SHIFT; - bio->bi_private = &done; - bio->bi_end_io = scrub_bio_wait_endio; - raid56_parity_recover(bio, sector->recover->bioc, sector->sblock->mirror_num); - - wait_for_completion_io(&done); - return blk_status_to_errno(bio->bi_status); -} - -static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock) -{ - struct scrub_sector *first_sector = sblock->sectors[0]; - struct bio *bio; - int i; - - /* All sectors in sblock belong to the same stripe on the same device. */ - ASSERT(sblock->dev); - if (!sblock->dev->bdev) - goto out; - - bio = bio_alloc(sblock->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); - - for (i = 0; i < sblock->sector_count; i++) { - struct scrub_sector *sector = sblock->sectors[i]; - - bio_add_scrub_sector(bio, sector, fs_info->sectorsize); - } - - if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) { - bio_put(bio); - goto out; - } - - bio_put(bio); - - scrub_recheck_block_checksum(sblock); - - return; -out: - for (i = 0; i < sblock->sector_count; i++) - sblock->sectors[i]->io_error = 1; - - sblock->no_io_error_seen = 0; -} - -/* - * This function will check the on disk data for checksum errors, header errors - * and read I/O errors. If any I/O errors happen, the exact sectors which are - * errored are marked as being bad. The goal is to enable scrub to take those - * sectors that are not errored from all the mirrors so that the sectors that - * are errored in the just handled mirror can be repaired. 
- */ -static void scrub_recheck_block(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, - int retry_failed_mirror) -{ - int i; - - sblock->no_io_error_seen = 1; - - /* short cut for raid56 */ - if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0])) - return scrub_recheck_block_on_raid56(fs_info, sblock); - - for (i = 0; i < sblock->sector_count; i++) { - struct scrub_sector *sector = sblock->sectors[i]; - struct bio bio; - struct bio_vec bvec; - - if (sblock->dev->bdev == NULL) { - sector->io_error = 1; - sblock->no_io_error_seen = 0; - continue; - } - - bio_init(&bio, sblock->dev->bdev, &bvec, 1, REQ_OP_READ); - bio_add_scrub_sector(&bio, sector, fs_info->sectorsize); - bio.bi_iter.bi_sector = (sblock->physical + sector->offset) >> - SECTOR_SHIFT; - - btrfsic_check_bio(&bio); - if (submit_bio_wait(&bio)) { - sector->io_error = 1; - sblock->no_io_error_seen = 0; - } - - bio_uninit(&bio); - } - - if (sblock->no_io_error_seen) - scrub_recheck_block_checksum(sblock); -} - -static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector) -{ - struct btrfs_fs_devices *fs_devices = sector->sblock->dev->fs_devices; - int ret; - - ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE); - return !ret; -} - -static void scrub_recheck_block_checksum(struct scrub_block *sblock) -{ - sblock->header_error = 0; - sblock->checksum_error = 0; - sblock->generation_error = 0; - - if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA) - scrub_checksum_data(sblock); - else - scrub_checksum_tree_block(sblock); -} - -static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good) -{ - int i; - int ret = 0; - - for (i = 0; i < sblock_bad->sector_count; i++) { - int ret_sub; - - ret_sub = scrub_repair_sector_from_good_copy(sblock_bad, - sblock_good, i, 1); - if (ret_sub) - ret = ret_sub; - } - - return ret; -} - -static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int sector_num, int force_write) -{ - struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num]; - struct scrub_sector *sector_good = sblock_good->sectors[sector_num]; - struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info; - const u32 sectorsize = fs_info->sectorsize; - - if (force_write || sblock_bad->header_error || - sblock_bad->checksum_error || sector_bad->io_error) { - struct bio bio; - struct bio_vec bvec; - int ret; - - if (!sblock_bad->dev->bdev) { - btrfs_warn_rl(fs_info, - "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected"); - return -EIO; - } - - bio_init(&bio, sblock_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE); - bio.bi_iter.bi_sector = (sblock_bad->physical + - sector_bad->offset) >> SECTOR_SHIFT; - ret = bio_add_scrub_sector(&bio, sector_good, sectorsize); - - btrfsic_check_bio(&bio); - ret = submit_bio_wait(&bio); - bio_uninit(&bio); - - if (ret) { - btrfs_dev_stat_inc_and_print(sblock_bad->dev, - BTRFS_DEV_STAT_WRITE_ERRS); - atomic64_inc(&fs_info->dev_replace.num_write_errors); - return -EIO; - } - } - - return 0; -} - -static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) -{ - struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; - int i; - - /* - * This block is used for the check of the parity on the source device, - * so the data needn't be written into the destination device. 
- */ - if (sblock->sparity) - return; - - for (i = 0; i < sblock->sector_count; i++) { - int ret; - - ret = scrub_write_sector_to_dev_replace(sblock, i); - if (ret) - atomic64_inc(&fs_info->dev_replace.num_write_errors); - } -} - -static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num) -{ - const u32 sectorsize = sblock->sctx->fs_info->sectorsize; - struct scrub_sector *sector = sblock->sectors[sector_num]; - - if (sector->io_error) - memset(scrub_sector_get_kaddr(sector), 0, sectorsize); - - return scrub_add_sector_to_wr_bio(sblock->sctx, sector); -} - static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) { int ret = 0; @@ -1810,1089 +604,653 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) return ret; } -static void scrub_block_get(struct scrub_block *sblock) +static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr) { - refcount_inc(&sblock->refs); -} - -static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx, - struct scrub_sector *sector) -{ - struct scrub_block *sblock = sector->sblock; - struct scrub_bio *sbio; - int ret; - const u32 sectorsize = sctx->fs_info->sectorsize; - - mutex_lock(&sctx->wr_lock); -again: - if (!sctx->wr_curr_bio) { - sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio), - GFP_KERNEL); - if (!sctx->wr_curr_bio) { - mutex_unlock(&sctx->wr_lock); - return -ENOMEM; - } - sctx->wr_curr_bio->sctx = sctx; - sctx->wr_curr_bio->sector_count = 0; - } - sbio = sctx->wr_curr_bio; - if (sbio->sector_count == 0) { - ret = fill_writer_pointer_gap(sctx, sector->offset + - sblock->physical_for_dev_replace); - if (ret) { - mutex_unlock(&sctx->wr_lock); - return ret; - } - - sbio->physical = sblock->physical_for_dev_replace + sector->offset; - sbio->logical = sblock->logical + sector->offset; - sbio->dev = sctx->wr_tgtdev; - if (!sbio->bio) { - sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, - REQ_OP_WRITE, GFP_NOFS); - } - sbio->bio->bi_private = sbio; - sbio->bio->bi_end_io = scrub_wr_bio_end_io; - sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; - sbio->status = 0; - } else if (sbio->physical + sbio->sector_count * sectorsize != - sblock->physical_for_dev_replace + sector->offset || - sbio->logical + sbio->sector_count * sectorsize != - sblock->logical + sector->offset) { - scrub_wr_submit(sctx); - goto again; - } + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT; - ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize); - if (ret != sectorsize) { - if (sbio->sector_count < 1) { - bio_put(sbio->bio); - sbio->bio = NULL; - mutex_unlock(&sctx->wr_lock); - return -EIO; - } - scrub_wr_submit(sctx); - goto again; - } - - sbio->sectors[sbio->sector_count] = sector; - scrub_sector_get(sector); - /* - * Since ssector no longer holds a page, but uses sblock::pages, we - * have to ensure the sblock had not been freed before our write bio - * finished. 
- */ - scrub_block_get(sector->sblock); - - sbio->sector_count++; - if (sbio->sector_count == sctx->sectors_per_bio) - scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_lock); - - return 0; + return stripe->pages[page_index]; } -static void scrub_wr_submit(struct scrub_ctx *sctx) +static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe, + int sector_nr) { - struct scrub_bio *sbio; - - if (!sctx->wr_curr_bio) - return; - - sbio = sctx->wr_curr_bio; - sctx->wr_curr_bio = NULL; - scrub_pending_bio_inc(sctx); - /* process all writes in a single worker thread. Then the block layer - * orders the requests before sending them to the driver which - * doubled the write performance on spinning disks when measured - * with Linux 3.5 */ - btrfsic_check_bio(sbio->bio); - submit_bio(sbio->bio); - - if (btrfs_is_zoned(sctx->fs_info)) - sctx->write_pointer = sbio->physical + sbio->sector_count * - sctx->fs_info->sectorsize; -} - -static void scrub_wr_bio_end_io(struct bio *bio) -{ - struct scrub_bio *sbio = bio->bi_private; - struct btrfs_fs_info *fs_info = sbio->dev->fs_info; - - sbio->status = bio->bi_status; - sbio->bio = bio; + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; - INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker); - queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); + return offset_in_page(sector_nr << fs_info->sectorsize_bits); } -static void scrub_wr_bio_end_io_worker(struct work_struct *work) +static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr) { - struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); - struct scrub_ctx *sctx = sbio->sctx; - int i; - - ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO); - if (sbio->status) { - struct btrfs_dev_replace *dev_replace = - &sbio->sctx->fs_info->dev_replace; - - for (i = 0; i < sbio->sector_count; i++) { - struct scrub_sector *sector = sbio->sectors[i]; - - sector->io_error = 1; - atomic64_inc(&dev_replace->num_write_errors); - } - } - - /* - * In scrub_add_sector_to_wr_bio() we grab extra ref for sblock, now in - * endio we should put the sblock. - */ - for (i = 0; i < sbio->sector_count; i++) { - scrub_block_put(sbio->sectors[i]->sblock); - scrub_sector_put(sbio->sectors[i]); - } - - bio_put(sbio->bio); - kfree(sbio); - scrub_pending_bio_dec(sctx); -} - -static int scrub_checksum(struct scrub_block *sblock) -{ - u64 flags; - int ret; - - /* - * No need to initialize these stats currently, - * because this function only use return value - * instead of these stats value. 
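The two helpers scrub_stripe_get_page() and scrub_stripe_get_page_offset() above reduce sector addressing to shift-and-mask arithmetic: the page index is (sector_nr << sectorsize_bits) >> PAGE_SHIFT and the in-page offset is the low PAGE_SHIFT bits. A quick userspace check of that arithmetic; the 4KiB page size, the two sector sizes and the function names are assumptions for the demo:

#include <stdio.h>

#define DEMO_PAGE_SHIFT 12
#define DEMO_PAGE_SIZE  (1UL << DEMO_PAGE_SHIFT)

static unsigned long demo_page_index(int sector_nr, int sectorsize_bits)
{
        return ((unsigned long)sector_nr << sectorsize_bits) >> DEMO_PAGE_SHIFT;
}

static unsigned long demo_page_offset(int sector_nr, int sectorsize_bits)
{
        return ((unsigned long)sector_nr << sectorsize_bits) & (DEMO_PAGE_SIZE - 1);
}

int main(void)
{
        /* 1KiB sectors: sector 5 lands in page 1 at offset 1024. */
        printf("page=%lu off=%lu\n", demo_page_index(5, 10), demo_page_offset(5, 10));
        /* 4KiB sectors: sector 5 is page 5 at offset 0. */
        printf("page=%lu off=%lu\n", demo_page_index(5, 12), demo_page_offset(5, 12));
        return 0;
}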
- * - * Todo: - * always use stats - */ - sblock->header_error = 0; - sblock->generation_error = 0; - sblock->checksum_error = 0; - - WARN_ON(sblock->sector_count < 1); - flags = sblock->sectors[0]->flags; - ret = 0; - if (flags & BTRFS_EXTENT_FLAG_DATA) - ret = scrub_checksum_data(sblock); - else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) - ret = scrub_checksum_tree_block(sblock); - else if (flags & BTRFS_EXTENT_FLAG_SUPER) - ret = scrub_checksum_super(sblock); - else - WARN_ON(1); - if (ret) - scrub_handle_errored_block(sblock); - - return ret; -} - -static int scrub_checksum_data(struct scrub_block *sblock) -{ - struct scrub_ctx *sctx = sblock->sctx; - struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; + const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits); + const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr); + const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr); SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - u8 csum[BTRFS_CSUM_SIZE]; - struct scrub_sector *sector; - char *kaddr; - - BUG_ON(sblock->sector_count < 1); - sector = sblock->sectors[0]; - if (!sector->have_csum) - return 0; - - kaddr = scrub_sector_get_kaddr(sector); - - shash->tfm = fs_info->csum_shash; - crypto_shash_init(shash); - - crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); - - if (memcmp(csum, sector->csum, fs_info->csum_size)) - sblock->checksum_error = 1; - return sblock->checksum_error; -} - -static int scrub_checksum_tree_block(struct scrub_block *sblock) -{ - struct scrub_ctx *sctx = sblock->sctx; - struct btrfs_header *h; - struct btrfs_fs_info *fs_info = sctx->fs_info; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; - /* - * This is done in sectorsize steps even for metadata as there's a - * constraint for nodesize to be aligned to sectorsize. This will need - * to change so we don't misuse data and metadata units like that. - */ - const u32 sectorsize = sctx->fs_info->sectorsize; - const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits; - int i; - struct scrub_sector *sector; - char *kaddr; - - BUG_ON(sblock->sector_count < 1); - - /* Each member in sectors is just one sector */ - ASSERT(sblock->sector_count == num_sectors); - - sector = sblock->sectors[0]; - kaddr = scrub_sector_get_kaddr(sector); - h = (struct btrfs_header *)kaddr; - memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size); + u8 calculated_csum[BTRFS_CSUM_SIZE]; + struct btrfs_header *header; /* - * we don't use the getter functions here, as we - * a) don't have an extent buffer and - * b) the page is already kmapped + * Here we don't have a good way to attach the pages (and subpages) + * to a dummy extent buffer, thus we have to directly grab the members + * from pages. 
*/ - if (sblock->logical != btrfs_stack_header_bytenr(h)) { - sblock->header_error = 1; + header = (struct btrfs_header *)(page_address(first_page) + first_off); + memcpy(on_disk_csum, header->csum, fs_info->csum_size); + + if (logical != btrfs_stack_header_bytenr(header)) { + bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); + bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad bytenr, has %llu want %llu", - sblock->logical, sblock->mirror_num, - btrfs_stack_header_bytenr(h), - sblock->logical); - goto out; + logical, stripe->mirror_num, + btrfs_stack_header_bytenr(header), logical); + return; } - - if (!scrub_check_fsid(h->fsid, sector)) { - sblock->header_error = 1; + if (memcmp(header->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE) != 0) { + bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); + bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad fsid, has %pU want %pU", - sblock->logical, sblock->mirror_num, - h->fsid, sblock->dev->fs_devices->fsid); - goto out; + logical, stripe->mirror_num, + header->fsid, fs_info->fs_devices->fsid); + return; } - - if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { - sblock->header_error = 1; + if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid, + BTRFS_UUID_SIZE) != 0) { + bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); + bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", - sblock->logical, sblock->mirror_num, - h->chunk_tree_uuid, fs_info->chunk_tree_uuid); - goto out; + logical, stripe->mirror_num, + header->chunk_tree_uuid, fs_info->chunk_tree_uuid); + return; } + /* Now check tree block csum. 
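One detail of the metadata path above worth spelling out: a mismatch in any header field or in the checksum taints the whole tree block, so sectors_per_tree consecutive bits starting at sector_nr are set in both the specific bitmap and error_bitmap. A tiny model of that marking, assuming 16KiB nodes and 4KiB sectors (sectors_per_tree == 4); the bitmap helper is a stand-in for the kernel's bitmap_set():

#include <stdio.h>

static void demo_bitmap_set(unsigned long *bitmap, int start, int len)
{
        for (int i = start; i < start + len; i++)
                *bitmap |= 1UL << i;
}

int main(void)
{
        unsigned long meta_error_bitmap = 0;
        unsigned long error_bitmap = 0;
        const int sectors_per_tree = 16384 >> 12;  /* nodesize / sectorsize */
        const int sector_nr = 8;                   /* tree block starts here */

        /* A bad bytenr/fsid/uuid/csum/generation taints the whole tree block. */
        demo_bitmap_set(&meta_error_bitmap, sector_nr, sectors_per_tree);
        demo_bitmap_set(&error_bitmap, sector_nr, sectors_per_tree);

        /* Expect 0xf00 in both bitmaps: sectors 8-11. */
        printf("meta=%#lx error=%#lx\n", meta_error_bitmap, error_bitmap);
        return 0;
}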
*/ shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); - crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, - sectorsize - BTRFS_CSUM_SIZE); + crypto_shash_update(shash, page_address(first_page) + first_off + + BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE); + + for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) { + struct page *page = scrub_stripe_get_page(stripe, i); + unsigned int page_off = scrub_stripe_get_page_offset(stripe, i); - for (i = 1; i < num_sectors; i++) { - kaddr = scrub_sector_get_kaddr(sblock->sectors[i]); - crypto_shash_update(shash, kaddr, sectorsize); + crypto_shash_update(shash, page_address(page) + page_off, + fs_info->sectorsize); } crypto_shash_final(shash, calculated_csum); - if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) { - sblock->checksum_error = 1; + if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) { + bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); + bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, - sblock->logical, sblock->mirror_num, + logical, stripe->mirror_num, CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); - goto out; + return; } - - if (sector->generation != btrfs_stack_header_generation(h)) { - sblock->header_error = 1; - sblock->generation_error = 1; + if (stripe->sectors[sector_nr].generation != + btrfs_stack_header_generation(header)) { + bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); + bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad generation, has %llu want %llu", - sblock->logical, sblock->mirror_num, - btrfs_stack_header_generation(h), - sector->generation); + logical, stripe->mirror_num, + btrfs_stack_header_generation(header), + stripe->sectors[sector_nr].generation); + return; } - -out: - return sblock->header_error || sblock->checksum_error; + bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree); + bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); + bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); } -static int scrub_checksum_super(struct scrub_block *sblock) +static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) { - struct btrfs_super_block *s; - struct scrub_ctx *sctx = sblock->sctx; - struct btrfs_fs_info *fs_info = sctx->fs_info; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - u8 calculated_csum[BTRFS_CSUM_SIZE]; - struct scrub_sector *sector; - char *kaddr; - int fail_gen = 0; - int fail_cor = 0; - - BUG_ON(sblock->sector_count < 1); - sector = sblock->sectors[0]; - kaddr = scrub_sector_get_kaddr(sector); - s = (struct btrfs_super_block *)kaddr; - - if (sblock->logical != btrfs_super_bytenr(s)) - ++fail_cor; - - if (sector->generation != btrfs_super_generation(s)) - ++fail_gen; - - if (!scrub_check_fsid(s->fsid, sector)) - ++fail_cor; + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct scrub_sector_verification *sector = &stripe->sectors[sector_nr]; + const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; + struct page *page = scrub_stripe_get_page(stripe, sector_nr); + unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr); + u8 csum_buf[BTRFS_CSUM_SIZE]; + int ret; - shash->tfm = fs_info->csum_shash; - crypto_shash_init(shash); - 
crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE, - BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum); + ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors); - if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size)) - ++fail_cor; + /* Sector not utilized, skip it. */ + if (!test_bit(sector_nr, &stripe->extent_sector_bitmap)) + return; - return fail_cor + fail_gen; -} + /* IO error, no need to check. */ + if (test_bit(sector_nr, &stripe->io_error_bitmap)) + return; -static void scrub_block_put(struct scrub_block *sblock) -{ - if (refcount_dec_and_test(&sblock->refs)) { - int i; - - if (sblock->sparity) - scrub_parity_put(sblock->sparity); - - for (i = 0; i < sblock->sector_count; i++) - scrub_sector_put(sblock->sectors[i]); - for (i = 0; i < DIV_ROUND_UP(sblock->len, PAGE_SIZE); i++) { - if (sblock->pages[i]) { - detach_scrub_page_private(sblock->pages[i]); - __free_page(sblock->pages[i]); - } + /* Metadata, verify the full tree block. */ + if (sector->is_metadata) { + /* + * Check if the tree block crosses the stripe boudary. If + * crossed the boundary, we cannot verify it but only give a + * warning. + * + * This can only happen on a very old filesystem where chunks + * are not ensured to be stripe aligned. + */ + if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) { + btrfs_warn_rl(fs_info, + "tree block at %llu crosses stripe boundary %llu", + stripe->logical + + (sector_nr << fs_info->sectorsize_bits), + stripe->logical); + return; } - kfree(sblock); - } -} - -static void scrub_sector_get(struct scrub_sector *sector) -{ - atomic_inc(§or->refs); -} - -static void scrub_sector_put(struct scrub_sector *sector) -{ - if (atomic_dec_and_test(§or->refs)) - kfree(sector); -} - -/* - * Throttling of IO submission, bandwidth-limit based, the timeslice is 1 - * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max. - */ -static void scrub_throttle(struct scrub_ctx *sctx) -{ - const int time_slice = 1000; - struct scrub_bio *sbio; - struct btrfs_device *device; - s64 delta; - ktime_t now; - u32 div; - u64 bwlimit; - - sbio = sctx->bios[sctx->curr]; - device = sbio->dev; - bwlimit = READ_ONCE(device->scrub_speed_max); - if (bwlimit == 0) + scrub_verify_one_metadata(stripe, sector_nr); return; + } /* - * Slice is divided into intervals when the IO is submitted, adjust by - * bwlimit and maximum of 64 intervals. + * Data is easier, we just verify the data csum (if we have it). For + * cases without csum, we have no other choice but to trust it. */ - div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); - div = min_t(u32, 64, div); - - /* Start new epoch, set deadline */ - now = ktime_get(); - if (sctx->throttle_deadline == 0) { - sctx->throttle_deadline = ktime_add_ms(now, time_slice / div); - sctx->throttle_sent = 0; + if (!sector->csum) { + clear_bit(sector_nr, &stripe->error_bitmap); + return; } - /* Still in the time to send? 
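The per-sector dispatch in scrub_verify_one_sector() above is: skip sectors outside the extent bitmap, skip sectors that already carry an I/O error, hand metadata sectors to the whole-tree-block check, and csum-verify data sectors while trusting data that has no csum. A compact userspace model of that decision order; the bitmaps and verdict strings are invented for the demo:

#include <stdio.h>

static const char *classify(int nr, unsigned long extent_bitmap,
                            unsigned long io_error_bitmap,
                            unsigned long meta_bitmap,
                            unsigned long has_csum_bitmap)
{
        if (!(extent_bitmap & (1UL << nr)))
                return "skip: not covered by any extent";
        if (io_error_bitmap & (1UL << nr))
                return "skip: already failed at the I/O level";
        if (meta_bitmap & (1UL << nr))
                return "verify the whole tree block";
        if (!(has_csum_bitmap & (1UL << nr)))
                return "data without csum: trust it";
        return "verify the data csum";
}

int main(void)
{
        /* Sector 0: metadata, 1: data with csum, 2: nodatasum data,
         * 3: I/O error, 4: hole (no extent).
         */
        unsigned long extent = 0x0f, io_err = 0x08, meta = 0x01, csum = 0x02;

        for (int i = 0; i < 5; i++)
                printf("sector %d: %s\n", i,
                       classify(i, extent, io_err, meta, csum));
        return 0;
}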
*/ - if (ktime_before(now, sctx->throttle_deadline)) { - /* If current bio is within the limit, send it */ - sctx->throttle_sent += sbio->bio->bi_iter.bi_size; - if (sctx->throttle_sent <= div_u64(bwlimit, div)) - return; - - /* We're over the limit, sleep until the rest of the slice */ - delta = ktime_ms_delta(sctx->throttle_deadline, now); + ret = btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf, sector->csum); + if (ret < 0) { + set_bit(sector_nr, &stripe->csum_error_bitmap); + set_bit(sector_nr, &stripe->error_bitmap); } else { - /* New request after deadline, start new epoch */ - delta = 0; - } - - if (delta) { - long timeout; - - timeout = div_u64(delta * HZ, 1000); - schedule_timeout_interruptible(timeout); + clear_bit(sector_nr, &stripe->csum_error_bitmap); + clear_bit(sector_nr, &stripe->error_bitmap); } - - /* Next call will start the deadline period */ - sctx->throttle_deadline = 0; -} - -static void scrub_submit(struct scrub_ctx *sctx) -{ - struct scrub_bio *sbio; - - if (sctx->curr == -1) - return; - - scrub_throttle(sctx); - - sbio = sctx->bios[sctx->curr]; - sctx->curr = -1; - scrub_pending_bio_inc(sctx); - btrfsic_check_bio(sbio->bio); - submit_bio(sbio->bio); } -static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx, - struct scrub_sector *sector) +/* Verify specified sectors of a stripe. */ +static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap) { - struct scrub_block *sblock = sector->sblock; - struct scrub_bio *sbio; - const u32 sectorsize = sctx->fs_info->sectorsize; - int ret; + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; + int sector_nr; -again: - /* - * grab a fresh bio or wait for one to become available - */ - while (sctx->curr == -1) { - spin_lock(&sctx->list_lock); - sctx->curr = sctx->first_free; - if (sctx->curr != -1) { - sctx->first_free = sctx->bios[sctx->curr]->next_free; - sctx->bios[sctx->curr]->next_free = -1; - sctx->bios[sctx->curr]->sector_count = 0; - spin_unlock(&sctx->list_lock); - } else { - spin_unlock(&sctx->list_lock); - wait_event(sctx->list_wait, sctx->first_free != -1); - } + for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) { + scrub_verify_one_sector(stripe, sector_nr); + if (stripe->sectors[sector_nr].is_metadata) + sector_nr += sectors_per_tree - 1; } - sbio = sctx->bios[sctx->curr]; - if (sbio->sector_count == 0) { - sbio->physical = sblock->physical + sector->offset; - sbio->logical = sblock->logical + sector->offset; - sbio->dev = sblock->dev; - if (!sbio->bio) { - sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, - REQ_OP_READ, GFP_NOFS); - } - sbio->bio->bi_private = sbio; - sbio->bio->bi_end_io = scrub_bio_end_io; - sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; - sbio->status = 0; - } else if (sbio->physical + sbio->sector_count * sectorsize != - sblock->physical + sector->offset || - sbio->logical + sbio->sector_count * sectorsize != - sblock->logical + sector->offset || - sbio->dev != sblock->dev) { - scrub_submit(sctx); - goto again; - } - - sbio->sectors[sbio->sector_count] = sector; - ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize); - if (ret != sectorsize) { - if (sbio->sector_count < 1) { - bio_put(sbio->bio); - sbio->bio = NULL; - return -EIO; - } - scrub_submit(sctx); - goto again; - } - - scrub_block_get(sblock); /* one for the page added to the bio */ - atomic_inc(&sblock->outstanding_sectors); - sbio->sector_count++; - if (sbio->sector_count == 
sctx->sectors_per_bio) - scrub_submit(sctx); - - return 0; } -static void scrub_missing_raid56_end_io(struct bio *bio) +static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec) { - struct scrub_block *sblock = bio->bi_private; - struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; - - btrfs_bio_counter_dec(fs_info); - if (bio->bi_status) - sblock->no_io_error_seen = 0; - - bio_put(bio); + int i; - queue_work(fs_info->scrub_workers, &sblock->work); + for (i = 0; i < stripe->nr_sectors; i++) { + if (scrub_stripe_get_page(stripe, i) == first_bvec->bv_page && + scrub_stripe_get_page_offset(stripe, i) == first_bvec->bv_offset) + break; + } + ASSERT(i < stripe->nr_sectors); + return i; } -static void scrub_missing_raid56_worker(struct work_struct *work) +/* + * Repair read is different to the regular read: + * + * - Only reads the failed sectors + * - May have extra blocksize limits + */ +static void scrub_repair_read_endio(struct btrfs_bio *bbio) { - struct scrub_block *sblock = container_of(work, struct scrub_block, work); - struct scrub_ctx *sctx = sblock->sctx; - struct btrfs_fs_info *fs_info = sctx->fs_info; - u64 logical; - struct btrfs_device *dev; + struct scrub_stripe *stripe = bbio->private; + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct bio_vec *bvec; + int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); + u32 bio_size = 0; + int i; - logical = sblock->logical; - dev = sblock->dev; + ASSERT(sector_nr < stripe->nr_sectors); - if (sblock->no_io_error_seen) - scrub_recheck_block_checksum(sblock); + bio_for_each_bvec_all(bvec, &bbio->bio, i) + bio_size += bvec->bv_len; - if (!sblock->no_io_error_seen) { - spin_lock(&sctx->stat_lock); - sctx->stat.read_errors++; - spin_unlock(&sctx->stat_lock); - btrfs_err_rl_in_rcu(fs_info, - "IO error rebuilding logical %llu for dev %s", - logical, btrfs_dev_name(dev)); - } else if (sblock->header_error || sblock->checksum_error) { - spin_lock(&sctx->stat_lock); - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - btrfs_err_rl_in_rcu(fs_info, - "failed to rebuild valid logical %llu for dev %s", - logical, btrfs_dev_name(dev)); + if (bbio->bio.bi_status) { + bitmap_set(&stripe->io_error_bitmap, sector_nr, + bio_size >> fs_info->sectorsize_bits); + bitmap_set(&stripe->error_bitmap, sector_nr, + bio_size >> fs_info->sectorsize_bits); } else { - scrub_write_block_to_dev_replace(sblock); - } - - if (sctx->is_dev_replace && sctx->flush_all_writes) { - mutex_lock(&sctx->wr_lock); - scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_lock); + bitmap_clear(&stripe->io_error_bitmap, sector_nr, + bio_size >> fs_info->sectorsize_bits); } + bio_put(&bbio->bio); + if (atomic_dec_and_test(&stripe->pending_io)) + wake_up(&stripe->io_wait); +} - scrub_block_put(sblock); - scrub_pending_bio_dec(sctx); +static int calc_next_mirror(int mirror, int num_copies) +{ + ASSERT(mirror <= num_copies); + return (mirror + 1 > num_copies) ? 
1 : mirror + 1; } -static void scrub_missing_raid56_pages(struct scrub_block *sblock) +static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, + int mirror, int blocksize, bool wait) { - struct scrub_ctx *sctx = sblock->sctx; - struct btrfs_fs_info *fs_info = sctx->fs_info; - u64 length = sblock->sector_count << fs_info->sectorsize_bits; - u64 logical = sblock->logical; - struct btrfs_io_context *bioc = NULL; - struct bio *bio; - struct btrfs_raid_bio *rbio; - int ret; + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct btrfs_bio *bbio = NULL; + const unsigned long old_error_bitmap = stripe->error_bitmap; int i; - btrfs_bio_counter_inc_blocked(fs_info); - ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &length, &bioc); - if (ret || !bioc || !bioc->raid_map) - goto bioc_out; - - if (WARN_ON(!sctx->is_dev_replace || - !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) { - /* - * We shouldn't be scrubbing a missing device. Even for dev - * replace, we should only get here for RAID 5/6. We either - * managed to mount something with no mirrors remaining or - * there's a bug in scrub_find_good_copy()/btrfs_map_block(). - */ - goto bioc_out; - } + ASSERT(stripe->mirror_num >= 1); + ASSERT(atomic_read(&stripe->pending_io) == 0); - bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); - bio->bi_iter.bi_sector = logical >> 9; - bio->bi_private = sblock; - bio->bi_end_io = scrub_missing_raid56_end_io; + for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) { + struct page *page; + int pgoff; + int ret; - rbio = raid56_alloc_missing_rbio(bio, bioc); - if (!rbio) - goto rbio_out; + page = scrub_stripe_get_page(stripe, i); + pgoff = scrub_stripe_get_page_offset(stripe, i); + + /* The current sector cannot be merged, submit the bio. 
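Illustrative aside (not part of the patch above): calc_next_mirror() treats mirror numbers as 1-based and simply wraps around, so the repair path can walk every other copy before returning to the one that failed. A minimal standalone sketch of that arithmetic, with hypothetical values:

#include <assert.h>
#include <stdio.h>

/* Same wrap-around arithmetic as calc_next_mirror(); purely illustrative. */
static int next_mirror(int mirror, int num_copies)
{
	assert(mirror >= 1 && mirror <= num_copies);
	return (mirror + 1 > num_copies) ? 1 : mirror + 1;
}

int main(void)
{
	int m = 2;	/* start from mirror 2 of a 3-copy profile */

	for (int i = 0; i < 4; i++) {
		printf("%d ", m);
		m = next_mirror(m, 3);
	}
	printf("\n");	/* prints: 2 3 1 2 */
	return 0;
}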
*/ + if (bbio && ((i > 0 && !test_bit(i - 1, &stripe->error_bitmap)) || + bbio->bio.bi_iter.bi_size >= blocksize)) { + ASSERT(bbio->bio.bi_iter.bi_size); + atomic_inc(&stripe->pending_io); + btrfs_submit_bio(bbio, mirror); + if (wait) + wait_scrub_stripe_io(stripe); + bbio = NULL; + } - for (i = 0; i < sblock->sector_count; i++) { - struct scrub_sector *sector = sblock->sectors[i]; + if (!bbio) { + bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, + fs_info, scrub_repair_read_endio, stripe); + bbio->bio.bi_iter.bi_sector = (stripe->logical + + (i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT; + } - raid56_add_scrub_pages(rbio, scrub_sector_get_page(sector), - scrub_sector_get_page_offset(sector), - sector->offset + sector->sblock->logical); + ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); + ASSERT(ret == fs_info->sectorsize); + } + if (bbio) { + ASSERT(bbio->bio.bi_iter.bi_size); + atomic_inc(&stripe->pending_io); + btrfs_submit_bio(bbio, mirror); + if (wait) + wait_scrub_stripe_io(stripe); } - - INIT_WORK(&sblock->work, scrub_missing_raid56_worker); - scrub_block_get(sblock); - scrub_pending_bio_inc(sctx); - raid56_submit_missing_rbio(rbio); - btrfs_put_bioc(bioc); - return; - -rbio_out: - bio_put(bio); -bioc_out: - btrfs_bio_counter_dec(fs_info); - btrfs_put_bioc(bioc); - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); } -static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, - u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num, u8 *csum, - u64 physical_for_dev_replace) +static void scrub_stripe_report_errors(struct scrub_ctx *sctx, + struct scrub_stripe *stripe) { - struct scrub_block *sblock; - const u32 sectorsize = sctx->fs_info->sectorsize; - int index; + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_device *dev = NULL; + u64 physical = 0; + int nr_data_sectors = 0; + int nr_meta_sectors = 0; + int nr_nodatacsum_sectors = 0; + int nr_repaired_sectors = 0; + int sector_nr; + + if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state)) + return; - sblock = alloc_scrub_block(sctx, dev, logical, physical, - physical_for_dev_replace, mirror_num); - if (!sblock) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); - return -ENOMEM; - } + /* + * Init the needed info for error reporting. + * + * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio(), + * and thus has no need for dev/physical, error reporting still needs dev and physical. + */ + if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) { + u64 mapped_len = fs_info->sectorsize; + struct btrfs_io_context *bioc = NULL; + int stripe_index = stripe->mirror_num - 1; + int ret; - for (index = 0; len > 0; index++) { - struct scrub_sector *sector; + /* For scrub, our mirror_num should always start at 1. */ + ASSERT(stripe->mirror_num >= 1); + ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, + stripe->logical, &mapped_len, &bioc); /* - * Here we will allocate one page for one sector to scrub. - * This is fine if PAGE_SIZE == sectorsize, but will cost - * more memory for PAGE_SIZE > sectorsize case. + * If we failed, dev will be NULL, and later detailed reports + * will just be skipped.
*/ - u32 l = min(sectorsize, len); + if (ret < 0) + goto skip; + physical = bioc->stripes[stripe_index].physical; + dev = bioc->stripes[stripe_index].dev; + btrfs_put_bioc(bioc); + } - sector = alloc_scrub_sector(sblock, logical); - if (!sector) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); - scrub_block_put(sblock); - return -ENOMEM; - } - sector->flags = flags; - sector->generation = gen; - if (csum) { - sector->have_csum = 1; - memcpy(sector->csum, csum, sctx->fs_info->csum_size); +skip: + for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) { + bool repaired = false; + + if (stripe->sectors[sector_nr].is_metadata) { + nr_meta_sectors++; } else { - sector->have_csum = 0; + nr_data_sectors++; + if (!stripe->sectors[sector_nr].csum) + nr_nodatacsum_sectors++; } - len -= l; - logical += l; - physical += l; - physical_for_dev_replace += l; - } - WARN_ON(sblock->sector_count == 0); - if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) { + if (test_bit(sector_nr, &stripe->init_error_bitmap) && + !test_bit(sector_nr, &stripe->error_bitmap)) { + nr_repaired_sectors++; + repaired = true; + } + + /* Good sector from the beginning, nothing needs to be done. */ + if (!test_bit(sector_nr, &stripe->init_error_bitmap)) + continue; + /* - * This case should only be hit for RAID 5/6 device replace. See - * the comment in scrub_missing_raid56_pages() for details. + * Report errors for the corrupted sectors. If repaired, just + * output a message noting the sector has been repaired. */ - scrub_missing_raid56_pages(sblock); - } else { - for (index = 0; index < sblock->sector_count; index++) { - struct scrub_sector *sector = sblock->sectors[index]; - int ret; - - ret = scrub_add_sector_to_rd_bio(sctx, sector); - if (ret) { - scrub_block_put(sblock); - return ret; + if (repaired) { + if (dev) { + btrfs_err_rl_in_rcu(fs_info, + "fixed up error at logical %llu on dev %s physical %llu", + stripe->logical, btrfs_dev_name(dev), + physical); + } else { + btrfs_err_rl_in_rcu(fs_info, + "fixed up error at logical %llu on mirror %u", + stripe->logical, stripe->mirror_num); } + continue; } - if (flags & BTRFS_EXTENT_FLAG_SUPER) - scrub_submit(sctx); - } - - /* last one frees, either here or in bio completion for last page */ - scrub_block_put(sblock); - return 0; -} - -static void scrub_bio_end_io(struct bio *bio) -{ - struct scrub_bio *sbio = bio->bi_private; - struct btrfs_fs_info *fs_info = sbio->dev->fs_info; + /* The remaining sectors are all unrepaired.
*/ + if (dev) { + btrfs_err_rl_in_rcu(fs_info, + "unable to fixup (regular) error at logical %llu on dev %s physical %llu", + stripe->logical, btrfs_dev_name(dev), + physical); + } else { + btrfs_err_rl_in_rcu(fs_info, + "unable to fixup (regular) error at logical %llu on mirror %u", + stripe->logical, stripe->mirror_num); + } - sbio->status = bio->bi_status; - sbio->bio = bio; + if (test_bit(sector_nr, &stripe->io_error_bitmap)) + if (__ratelimit(&rs) && dev) + scrub_print_common_warning("i/o error", dev, false, + stripe->logical, physical); + if (test_bit(sector_nr, &stripe->csum_error_bitmap)) + if (__ratelimit(&rs) && dev) + scrub_print_common_warning("checksum error", dev, false, + stripe->logical, physical); + if (test_bit(sector_nr, &stripe->meta_error_bitmap)) + if (__ratelimit(&rs) && dev) + scrub_print_common_warning("header error", dev, false, + stripe->logical, physical); + } - queue_work(fs_info->scrub_workers, &sbio->work); + spin_lock(&sctx->stat_lock); + sctx->stat.data_extents_scrubbed += stripe->nr_data_extents; + sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents; + sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits; + sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits; + sctx->stat.no_csum += nr_nodatacsum_sectors; + sctx->stat.read_errors += + bitmap_weight(&stripe->io_error_bitmap, stripe->nr_sectors); + sctx->stat.csum_errors += + bitmap_weight(&stripe->csum_error_bitmap, stripe->nr_sectors); + sctx->stat.verify_errors += + bitmap_weight(&stripe->meta_error_bitmap, stripe->nr_sectors); + sctx->stat.uncorrectable_errors += + bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors); + sctx->stat.corrected_errors += nr_repaired_sectors; + spin_unlock(&sctx->stat_lock); } -static void scrub_bio_end_io_worker(struct work_struct *work) +/* + * The main entrance for all read related scrub work, including: + * + * - Wait for the initial read to finish + * - Verify and locate any bad sectors + * - Go through the remaining mirrors and try to read as large blocksize as + * possible + * - Go through all mirrors (including the failed mirror) sector-by-sector + * + * Writeback does not happen here, it needs extra synchronization. + */ +static void scrub_stripe_read_repair_worker(struct work_struct *work) { - struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); - struct scrub_ctx *sctx = sbio->sctx; + struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work); + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, + stripe->bg->length); + int mirror; int i; - ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO); - if (sbio->status) { - for (i = 0; i < sbio->sector_count; i++) { - struct scrub_sector *sector = sbio->sectors[i]; + ASSERT(stripe->mirror_num > 0); - sector->io_error = 1; - sector->sblock->no_io_error_seen = 0; - } - } + wait_scrub_stripe_io(stripe); + scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap); + /* Save the initial failed bitmap for later repair and report usage. 
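Illustrative aside (not part of the patch): the repair path keeps the initial error bitmap around so that, once the other mirrors have been tried, the "repaired" sectors are simply the bits that were set at the start but are clear now. A standalone sketch of that bitmap arithmetic, analogous to bitmap_andnot(), using hypothetical values:

#include <stdio.h>

int main(void)
{
	/* Hypothetical 16-sector stripe: sectors 2 and 5 failed the first read. */
	unsigned long init_error = (1UL << 2) | (1UL << 5);
	/* After trying the other mirrors, only sector 5 is still bad. */
	unsigned long error = 1UL << 5;
	/* Bad initially but good now -> repaired. */
	unsigned long repaired = init_error & ~error;

	printf("repaired bitmap: 0x%lx\n", repaired);	/* 0x4, i.e. sector 2 */
	return 0;
}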
*/ + stripe->init_error_bitmap = stripe->error_bitmap; - /* Now complete the scrub_block items that have all pages completed */ - for (i = 0; i < sbio->sector_count; i++) { - struct scrub_sector *sector = sbio->sectors[i]; - struct scrub_block *sblock = sector->sblock; + if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) + goto out; - if (atomic_dec_and_test(&sblock->outstanding_sectors)) - scrub_block_complete(sblock); - scrub_block_put(sblock); + /* + * Try all remaining mirrors. + * + * Here we still try to read as large block as possible, as this is + * faster and we have extra safety nets to rely on. + */ + for (mirror = calc_next_mirror(stripe->mirror_num, num_copies); + mirror != stripe->mirror_num; + mirror = calc_next_mirror(mirror, num_copies)) { + const unsigned long old_error_bitmap = stripe->error_bitmap; + + scrub_stripe_submit_repair_read(stripe, mirror, + BTRFS_STRIPE_LEN, false); + wait_scrub_stripe_io(stripe); + scrub_verify_one_stripe(stripe, old_error_bitmap); + if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) + goto out; } - bio_put(sbio->bio); - sbio->bio = NULL; - spin_lock(&sctx->list_lock); - sbio->next_free = sctx->first_free; - sctx->first_free = sbio->index; - spin_unlock(&sctx->list_lock); + /* + * Last safety net, try re-checking all mirrors, including the failed + * one, sector-by-sector. + * + * As if one sector failed the drive's internal csum, the whole read + * containing the offending sector would be marked as error. + * Thus here we do sector-by-sector read. + * + * This can be slow, thus we only try it as the last resort. + */ - if (sctx->is_dev_replace && sctx->flush_all_writes) { - mutex_lock(&sctx->wr_lock); - scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_lock); - } + for (i = 0, mirror = stripe->mirror_num; + i < num_copies; + i++, mirror = calc_next_mirror(mirror, num_copies)) { + const unsigned long old_error_bitmap = stripe->error_bitmap; - scrub_pending_bio_dec(sctx); + scrub_stripe_submit_repair_read(stripe, mirror, + fs_info->sectorsize, true); + wait_scrub_stripe_io(stripe); + scrub_verify_one_stripe(stripe, old_error_bitmap); + if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) + goto out; + } +out: + scrub_stripe_report_errors(stripe->sctx, stripe); + set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state); + wake_up(&stripe->repair_wait); } -static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, - unsigned long *bitmap, - u64 start, u32 len) +static void scrub_read_endio(struct btrfs_bio *bbio) { - u64 offset; - u32 nsectors; - u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits; + struct scrub_stripe *stripe = bbio->private; - if (len >= sparity->stripe_len) { - bitmap_set(bitmap, 0, sparity->nsectors); - return; + if (bbio->bio.bi_status) { + bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors); + bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors); + } else { + bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors); } - - start -= sparity->logic_start; - start = div64_u64_rem(start, sparity->stripe_len, &offset); - offset = offset >> sectorsize_bits; - nsectors = len >> sectorsize_bits; - - if (offset + nsectors <= sparity->nsectors) { - bitmap_set(bitmap, offset, nsectors); - return; + bio_put(&bbio->bio); + if (atomic_dec_and_test(&stripe->pending_io)) { + wake_up(&stripe->io_wait); + INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker); + queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work); } - - bitmap_set(bitmap, offset, 
sparity->nsectors - offset); - bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset)); -} - -static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity, - u64 start, u32 len) -{ - __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len); } -static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity, - u64 start, u32 len) +static void scrub_write_endio(struct btrfs_bio *bbio) { - __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len); -} - -static void scrub_block_complete(struct scrub_block *sblock) -{ - int corrupted = 0; + struct scrub_stripe *stripe = bbio->private; + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct bio_vec *bvec; + int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); + u32 bio_size = 0; + int i; - if (!sblock->no_io_error_seen) { - corrupted = 1; - scrub_handle_errored_block(sblock); - } else { - /* - * if has checksum error, write via repair mechanism in - * dev replace case, otherwise write here in dev replace - * case. - */ - corrupted = scrub_checksum(sblock); - if (!corrupted && sblock->sctx->is_dev_replace) - scrub_write_block_to_dev_replace(sblock); - } + bio_for_each_bvec_all(bvec, &bbio->bio, i) + bio_size += bvec->bv_len; - if (sblock->sparity && corrupted && !sblock->data_corrected) { - u64 start = sblock->logical; - u64 end = sblock->logical + - sblock->sectors[sblock->sector_count - 1]->offset + - sblock->sctx->fs_info->sectorsize; + if (bbio->bio.bi_status) { + unsigned long flags; - ASSERT(end - start <= U32_MAX); - scrub_parity_mark_sectors_error(sblock->sparity, - start, end - start); + spin_lock_irqsave(&stripe->write_error_lock, flags); + bitmap_set(&stripe->write_error_bitmap, sector_nr, + bio_size >> fs_info->sectorsize_bits); + spin_unlock_irqrestore(&stripe->write_error_lock, flags); } -} + bio_put(&bbio->bio); -static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum) -{ - sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits; - list_del(&sum->list); - kfree(sum); + if (atomic_dec_and_test(&stripe->pending_io)) + wake_up(&stripe->io_wait); } /* - * Find the desired csum for range [logical, logical + sectorsize), and store - * the csum into @csum. + * Submit the write bio(s) for the sectors specified by @write_bitmap. * - * The search source is sctx->csum_list, which is a pre-populated list - * storing bytenr ordered csum ranges. We're responsible to cleanup any range - * that is before @logical. + * Here we utilize btrfs_submit_repair_write(), which has some extra benefits: * - * Return 0 if there is no csum for the range. - * Return 1 if there is csum for the range and copied to @csum. 
+ * - Only needs logical bytenr and mirror_num + * Just like the scrub read path + * + * - Would only result in writes to the specified mirror + * Unlike the regular writeback path, which would write back to all stripes + * + * - Handle dev-replace and read-repair writeback differently */ -static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) +static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe, + unsigned long write_bitmap, bool dev_replace) { - bool found = false; + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct btrfs_bio *bbio = NULL; + const bool zoned = btrfs_is_zoned(fs_info); + int sector_nr; - while (!list_empty(&sctx->csum_list)) { - struct btrfs_ordered_sum *sum = NULL; - unsigned long index; - unsigned long num_sectors; - - sum = list_first_entry(&sctx->csum_list, - struct btrfs_ordered_sum, list); - /* The current csum range is beyond our range, no csum found */ - if (sum->bytenr > logical) - break; + for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) { + struct page *page = scrub_stripe_get_page(stripe, sector_nr); + unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr); + int ret; - /* - * The current sum is before our bytenr, since scrub is always - * done in bytenr order, the csum will never be used anymore, - * clean it up so that later calls won't bother with the range, - * and continue search the next range. - */ - if (sum->bytenr + sum->len <= logical) { - drop_csum_range(sctx, sum); - continue; + /* We should only writeback sectors covered by an extent. */ + ASSERT(test_bit(sector_nr, &stripe->extent_sector_bitmap)); + + /* Cannot merge with previous sector, submit the current one. */ + if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) { + fill_writer_pointer_gap(sctx, stripe->physical + + (sector_nr << fs_info->sectorsize_bits)); + atomic_inc(&stripe->pending_io); + btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace); + /* For zoned writeback, queue depth must be 1. */ + if (zoned) + wait_scrub_stripe_io(stripe); + bbio = NULL; } - - /* Now the csum range covers our bytenr, copy the csum */ - found = true; - index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits; - num_sectors = sum->len >> sctx->fs_info->sectorsize_bits; - - memcpy(csum, sum->sums + index * sctx->fs_info->csum_size, - sctx->fs_info->csum_size); - - /* Cleanup the range if we're at the end of the csum range */ - if (index == num_sectors - 1) - drop_csum_range(sctx, sum); - break; + if (!bbio) { + bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_WRITE, + fs_info, scrub_write_endio, stripe); + bbio->bio.bi_iter.bi_sector = (stripe->logical + + (sector_nr << fs_info->sectorsize_bits)) >> + SECTOR_SHIFT; + } + ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); + ASSERT(ret == fs_info->sectorsize); + } + if (bbio) { + fill_writer_pointer_gap(sctx, bbio->bio.bi_iter.bi_sector << + SECTOR_SHIFT); + atomic_inc(&stripe->pending_io); + btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace); + if (zoned) + wait_scrub_stripe_io(stripe); } - if (!found) - return 0; - return 1; } -/* scrub extent tries to collect up to 64 kB for each bio */ -static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, - u64 logical, u32 len, - u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num) +/* + * Throttling of IO submission, bandwidth-limit based, the timeslice is 1 + * second. 
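Illustrative aside (not part of the patch): scrub_write_sectors() above turns each run of consecutive set bits in @write_bitmap into one write bio, submitting whenever it hits a gap. A standalone sketch of that grouping, with a hypothetical bitmap:

#include <stdio.h>

int main(void)
{
	/* Hypothetical write bitmap: sectors 1-3 and 6-7 need writeback. */
	const unsigned long write_bitmap = 0xce;	/* 0b11001110 */
	const int nr_sectors = 8;
	int start = -1;

	/* Consecutive set bits share one bio; a cleared bit forces a submit. */
	for (int i = 0; i <= nr_sectors; i++) {
		int set = (i < nr_sectors) && ((write_bitmap >> i) & 1);

		if (set && start < 0)
			start = i;
		if (!set && start >= 0) {
			printf("one write bio for sectors %d-%d\n", start, i - 1);
			start = -1;
		}
	}
	return 0;
}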
Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max. + */ +static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *device, + unsigned int bio_size) { - struct btrfs_device *src_dev = dev; - u64 src_physical = physical; - int src_mirror = mirror_num; - int ret; - u8 csum[BTRFS_CSUM_SIZE]; - u32 blocksize; + const int time_slice = 1000; + s64 delta; + ktime_t now; + u32 div; + u64 bwlimit; - if (flags & BTRFS_EXTENT_FLAG_DATA) { - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - blocksize = map->stripe_len; - else - blocksize = sctx->fs_info->sectorsize; - spin_lock(&sctx->stat_lock); - sctx->stat.data_extents_scrubbed++; - sctx->stat.data_bytes_scrubbed += len; - spin_unlock(&sctx->stat_lock); - } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - blocksize = map->stripe_len; - else - blocksize = sctx->fs_info->nodesize; - spin_lock(&sctx->stat_lock); - sctx->stat.tree_extents_scrubbed++; - sctx->stat.tree_bytes_scrubbed += len; - spin_unlock(&sctx->stat_lock); - } else { - blocksize = sctx->fs_info->sectorsize; - WARN_ON(1); - } + bwlimit = READ_ONCE(device->scrub_speed_max); + if (bwlimit == 0) + return; /* - * For dev-replace case, we can have @dev being a missing device. - * Regular scrub will avoid its execution on missing device at all, - * as that would trigger tons of read error. - * - * Reading from missing device will cause read error counts to - * increase unnecessarily. - * So here we change the read source to a good mirror. + * Slice is divided into intervals when the IO is submitted, adjust by + * bwlimit and maximum of 64 intervals. */ - if (sctx->is_dev_replace && !dev->bdev) - scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical, - &src_dev, &src_mirror); - while (len) { - u32 l = min(len, blocksize); - int have_csum = 0; - - if (flags & BTRFS_EXTENT_FLAG_DATA) { - /* push csums to sbio */ - have_csum = scrub_find_csum(sctx, logical, csum); - if (have_csum == 0) - ++sctx->stat.no_csum; - } - ret = scrub_sectors(sctx, logical, l, src_physical, src_dev, - flags, gen, src_mirror, - have_csum ? 
csum : NULL, physical); - if (ret) - return ret; - len -= l; - logical += l; - physical += l; - src_physical += l; - } - return 0; -} - -static int scrub_sectors_for_parity(struct scrub_parity *sparity, - u64 logical, u32 len, - u64 physical, struct btrfs_device *dev, - u64 flags, u64 gen, int mirror_num, u8 *csum) -{ - struct scrub_ctx *sctx = sparity->sctx; - struct scrub_block *sblock; - const u32 sectorsize = sctx->fs_info->sectorsize; - int index; - - ASSERT(IS_ALIGNED(len, sectorsize)); - - sblock = alloc_scrub_block(sctx, dev, logical, physical, physical, mirror_num); - if (!sblock) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); - return -ENOMEM; - } - - sblock->sparity = sparity; - scrub_parity_get(sparity); - - for (index = 0; len > 0; index++) { - struct scrub_sector *sector; - - sector = alloc_scrub_sector(sblock, logical); - if (!sector) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); - scrub_block_put(sblock); - return -ENOMEM; - } - sblock->sectors[index] = sector; - /* For scrub parity */ - scrub_sector_get(sector); - list_add_tail(&sector->list, &sparity->sectors_list); - sector->flags = flags; - sector->generation = gen; - if (csum) { - sector->have_csum = 1; - memcpy(sector->csum, csum, sctx->fs_info->csum_size); - } else { - sector->have_csum = 0; - } - - /* Iterate over the stripe range in sectorsize steps */ - len -= sectorsize; - logical += sectorsize; - physical += sectorsize; - } - - WARN_ON(sblock->sector_count == 0); - for (index = 0; index < sblock->sector_count; index++) { - struct scrub_sector *sector = sblock->sectors[index]; - int ret; + div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); + div = min_t(u32, 64, div); - ret = scrub_add_sector_to_rd_bio(sctx, sector); - if (ret) { - scrub_block_put(sblock); - return ret; - } + /* Start new epoch, set deadline */ + now = ktime_get(); + if (sctx->throttle_deadline == 0) { + sctx->throttle_deadline = ktime_add_ms(now, time_slice / div); + sctx->throttle_sent = 0; } - /* Last one frees, either here or in bio completion for last sector */ - scrub_block_put(sblock); - return 0; -} - -static int scrub_extent_for_parity(struct scrub_parity *sparity, - u64 logical, u32 len, - u64 physical, struct btrfs_device *dev, - u64 flags, u64 gen, int mirror_num) -{ - struct scrub_ctx *sctx = sparity->sctx; - int ret; - u8 csum[BTRFS_CSUM_SIZE]; - u32 blocksize; - - if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) { - scrub_parity_mark_sectors_error(sparity, logical, len); - return 0; - } + /* Still in the time to send?
*/ + if (ktime_before(now, sctx->throttle_deadline)) { + /* If current bio is within the limit, send it */ + sctx->throttle_sent += bio_size; + if (sctx->throttle_sent <= div_u64(bwlimit, div)) + return; - if (flags & BTRFS_EXTENT_FLAG_DATA) { - blocksize = sparity->stripe_len; - } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - blocksize = sparity->stripe_len; + /* We're over the limit, sleep until the rest of the slice */ + delta = ktime_ms_delta(sctx->throttle_deadline, now); } else { - blocksize = sctx->fs_info->sectorsize; - WARN_ON(1); + /* New request after deadline, start new epoch */ + delta = 0; } - while (len) { - u32 l = min(len, blocksize); - int have_csum = 0; + if (delta) { + long timeout; - if (flags & BTRFS_EXTENT_FLAG_DATA) { - /* push csums to sbio */ - have_csum = scrub_find_csum(sctx, logical, csum); - if (have_csum == 0) - goto skip; - } - ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev, - flags, gen, mirror_num, - have_csum ? csum : NULL); - if (ret) - return ret; -skip: - len -= l; - logical += l; - physical += l; + timeout = div_u64(delta * HZ, 1000); + schedule_timeout_interruptible(timeout); } - return 0; + + /* Next call will start the deadline period */ + sctx->throttle_deadline = 0; } /* @@ -2908,10 +1266,7 @@ static int get_raid56_logic_offset(u64 physical, int num, { int i; int j = 0; - u64 stripe_nr; u64 last_offset; - u32 stripe_index; - u32 rot; const int data_stripes = nr_data_stripes(map); last_offset = (physical - map->stripes[num].physical) * data_stripes; @@ -2920,13 +1275,17 @@ static int get_raid56_logic_offset(u64 physical, int num, *offset = last_offset; for (i = 0; i < data_stripes; i++) { - *offset = last_offset + i * map->stripe_len; + u32 stripe_nr; + u32 stripe_index; + u32 rot; - stripe_nr = div64_u64(*offset, map->stripe_len); - stripe_nr = div_u64(stripe_nr, data_stripes); + *offset = last_offset + (i << BTRFS_STRIPE_LEN_SHIFT); + + stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes; /* Work out the disk rotation on this stripe-set */ - stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot); + rot = stripe_nr % map->num_stripes; + stripe_nr /= map->num_stripes; /* calculate which stripe this data locates */ rot += i; stripe_index = rot % map->num_stripes; @@ -2935,123 +1294,10 @@ static int get_raid56_logic_offset(u64 physical, int num, if (stripe_index < num) j++; } - *offset = last_offset + j * map->stripe_len; + *offset = last_offset + (j << BTRFS_STRIPE_LEN_SHIFT); return 1; } -static void scrub_free_parity(struct scrub_parity *sparity) -{ - struct scrub_ctx *sctx = sparity->sctx; - struct scrub_sector *curr, *next; - int nbits; - - nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors); - if (nbits) { - spin_lock(&sctx->stat_lock); - sctx->stat.read_errors += nbits; - sctx->stat.uncorrectable_errors += nbits; - spin_unlock(&sctx->stat_lock); - } - - list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) { - list_del_init(&curr->list); - scrub_sector_put(curr); - } - - kfree(sparity); -} - -static void scrub_parity_bio_endio_worker(struct work_struct *work) -{ - struct scrub_parity *sparity = container_of(work, struct scrub_parity, - work); - struct scrub_ctx *sctx = sparity->sctx; - - btrfs_bio_counter_dec(sctx->fs_info); - scrub_free_parity(sparity); - scrub_pending_bio_dec(sctx); -} - -static void scrub_parity_bio_endio(struct bio *bio) -{ - struct scrub_parity *sparity = bio->bi_private; - struct btrfs_fs_info *fs_info = sparity->sctx->fs_info; - - if (bio->bi_status) - 
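Illustrative aside (not part of the patch): the interval math in scrub_throttle_dev_io() above splits the one-second timeslice into between 1 and 64 intervals depending on the configured limit, and charges each submitted bio against the per-interval byte budget. A standalone sketch with a hypothetical 100 MiB/s limit:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical value read from scrub_speed_max: 100 MiB/s. */
	const uint64_t bwlimit = 100ULL * 1024 * 1024;

	/* Same clamping as the kernel code: 1 <= div <= 64. */
	uint32_t div = (uint32_t)(bwlimit / (16 * 1024 * 1024));
	if (div < 1)
		div = 1;
	if (div > 64)
		div = 64;

	printf("intervals: %u, interval length: %u ms, per-interval budget: %llu bytes\n",
	       div, 1000u / div, (unsigned long long)(bwlimit / div));
	return 0;
}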
bitmap_or(&sparity->ebitmap, &sparity->ebitmap, - &sparity->dbitmap, sparity->nsectors); - - bio_put(bio); - - INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker); - queue_work(fs_info->scrub_parity_workers, &sparity->work); -} - -static void scrub_parity_check_and_repair(struct scrub_parity *sparity) -{ - struct scrub_ctx *sctx = sparity->sctx; - struct btrfs_fs_info *fs_info = sctx->fs_info; - struct bio *bio; - struct btrfs_raid_bio *rbio; - struct btrfs_io_context *bioc = NULL; - u64 length; - int ret; - - if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap, - &sparity->ebitmap, sparity->nsectors)) - goto out; - - length = sparity->logic_end - sparity->logic_start; - - btrfs_bio_counter_inc_blocked(fs_info); - ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start, - &length, &bioc); - if (ret || !bioc || !bioc->raid_map) - goto bioc_out; - - bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); - bio->bi_iter.bi_sector = sparity->logic_start >> 9; - bio->bi_private = sparity; - bio->bi_end_io = scrub_parity_bio_endio; - - rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, - sparity->scrub_dev, - &sparity->dbitmap, - sparity->nsectors); - btrfs_put_bioc(bioc); - if (!rbio) - goto rbio_out; - - scrub_pending_bio_inc(sctx); - raid56_parity_submit_scrub_rbio(rbio); - return; - -rbio_out: - bio_put(bio); -bioc_out: - btrfs_bio_counter_dec(fs_info); - bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap, - sparity->nsectors); - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); -out: - scrub_free_parity(sparity); -} - -static void scrub_parity_get(struct scrub_parity *sparity) -{ - refcount_inc(&sparity->refs); -} - -static void scrub_parity_put(struct scrub_parity *sparity) -{ - if (!refcount_dec_and_test(&sparity->refs)) - return; - - scrub_parity_check_and_repair(sparity); -} - /* * Return 0 if the extent item range covers any byte of the range. * Return <0 if the extent item is before @search_start. 
@@ -3178,226 +1424,533 @@ static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret, *generation_ret = btrfs_extent_generation(path->nodes[0], ei); } -static bool does_range_cross_boundary(u64 extent_start, u64 extent_len, - u64 boundary_start, u64 boudary_len) +static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, + u64 physical, u64 physical_end) { - return (extent_start < boundary_start && - extent_start + extent_len > boundary_start) || - (extent_start < boundary_start + boudary_len && - extent_start + extent_len > boundary_start + boudary_len); + struct btrfs_fs_info *fs_info = sctx->fs_info; + int ret = 0; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + mutex_lock(&sctx->wr_lock); + if (sctx->write_pointer < physical_end) { + ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical, + physical, + sctx->write_pointer); + if (ret) + btrfs_err(fs_info, + "zoned: failed to recover write pointer"); + } + mutex_unlock(&sctx->wr_lock); + btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); + + return ret; } -static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx, - struct scrub_parity *sparity, - struct map_lookup *map, - struct btrfs_device *sdev, - struct btrfs_path *path, - u64 logical) +static void fill_one_extent_info(struct btrfs_fs_info *fs_info, + struct scrub_stripe *stripe, + u64 extent_start, u64 extent_len, + u64 extent_flags, u64 extent_gen) +{ + for (u64 cur_logical = max(stripe->logical, extent_start); + cur_logical < min(stripe->logical + BTRFS_STRIPE_LEN, + extent_start + extent_len); + cur_logical += fs_info->sectorsize) { + const int nr_sector = (cur_logical - stripe->logical) >> + fs_info->sectorsize_bits; + struct scrub_sector_verification *sector = + &stripe->sectors[nr_sector]; + + set_bit(nr_sector, &stripe->extent_sector_bitmap); + if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + sector->is_metadata = true; + sector->generation = extent_gen; + } + } +} + +static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) { - struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical); - struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical); - u64 cur_logical = logical; + stripe->extent_sector_bitmap = 0; + stripe->init_error_bitmap = 0; + stripe->error_bitmap = 0; + stripe->io_error_bitmap = 0; + stripe->csum_error_bitmap = 0; + stripe->meta_error_bitmap = 0; +} + +/* + * Locate one stripe which has at least one extent in its range. + * + * Return 0 if found such stripe, and store its info into @stripe. + * Return >0 if there is no such stripe in the specified range. + * Return <0 for error. 
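Illustrative aside (not part of the patch): fill_one_extent_info() above walks the overlap of an extent with the 64 KiB stripe in sectorsize steps and sets one bit per covered sector. A standalone sketch assuming 4 KiB sectors and hypothetical offsets:

#include <stdio.h>

#define SECTORSIZE	4096ULL
#define STRIPE_LEN	(64ULL * 1024)		/* BTRFS_STRIPE_LEN */

int main(void)
{
	/* Hypothetical stripe start and an extent fully inside the stripe. */
	const unsigned long long stripe_logical = 1024ULL * 1024;
	const unsigned long long extent_start = stripe_logical + 2 * SECTORSIZE;
	const unsigned long long extent_len = 5 * SECTORSIZE;
	unsigned long bitmap = 0;

	for (unsigned long long cur = extent_start;
	     cur < extent_start + extent_len; cur += SECTORSIZE)
		bitmap |= 1UL << ((cur - stripe_logical) / SECTORSIZE);

	printf("extent_sector_bitmap: 0x%lx\n", bitmap);	/* 0x7c -> sectors 2..6 */
	return 0;
}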
+ */ +static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, + struct btrfs_device *dev, u64 physical, + int mirror_num, u64 logical_start, + u32 logical_len, + struct scrub_stripe *stripe) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start); + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start); + const u64 logical_end = logical_start + logical_len; + struct btrfs_path path = { 0 }; + u64 cur_logical = logical_start; + u64 stripe_end; + u64 extent_start; + u64 extent_len; + u64 extent_flags; + u64 extent_gen; int ret; - ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK); + memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * + stripe->nr_sectors); + scrub_stripe_reset_bitmaps(stripe); - /* Path must not be populated */ - ASSERT(!path->nodes[0]); + /* The range must be inside the bg. */ + ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); - while (cur_logical < logical + map->stripe_len) { - struct btrfs_io_context *bioc = NULL; - struct btrfs_device *extent_dev; - u64 extent_start; - u64 extent_size; - u64 mapped_length; - u64 extent_flags; - u64 extent_gen; - u64 extent_physical; - u64 extent_mirror_num; - - ret = find_first_extent_item(extent_root, path, cur_logical, - logical + map->stripe_len - cur_logical); - /* No more extent item in this data stripe */ + path.search_commit_root = 1; + path.skip_locking = 1; + + ret = find_first_extent_item(extent_root, &path, logical_start, logical_len); + /* Either error or not found. */ + if (ret) + goto out; + get_extent_info(&path, &extent_start, &extent_len, &extent_flags, &extent_gen); + if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + stripe->nr_meta_extents++; + if (extent_flags & BTRFS_EXTENT_FLAG_DATA) + stripe->nr_data_extents++; + cur_logical = max(extent_start, cur_logical); + + /* + * Round down to stripe boundary. + * + * The extra calculation against bg->start is to handle block groups + * whose logical bytenr is not BTRFS_STRIPE_LEN aligned. + */ + stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) + + bg->start; + stripe->physical = physical + stripe->logical - logical_start; + stripe->dev = dev; + stripe->bg = bg; + stripe->mirror_num = mirror_num; + stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1; + + /* Fill the first extent info into stripe->sectors[] array. */ + fill_one_extent_info(fs_info, stripe, extent_start, extent_len, + extent_flags, extent_gen); + cur_logical = extent_start + extent_len; + + /* Fill the extent info for the remaining sectors. */ + while (cur_logical <= stripe_end) { + ret = find_first_extent_item(extent_root, &path, cur_logical, + stripe_end - cur_logical + 1); + if (ret < 0) + goto out; if (ret > 0) { ret = 0; break; } + get_extent_info(&path, &extent_start, &extent_len, + &extent_flags, &extent_gen); + if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + stripe->nr_meta_extents++; + if (extent_flags & BTRFS_EXTENT_FLAG_DATA) + stripe->nr_data_extents++; + fill_one_extent_info(fs_info, stripe, extent_start, extent_len, + extent_flags, extent_gen); + cur_logical = extent_start + extent_len; + } + + /* Now fill the data csum. */ + if (bg->flags & BTRFS_BLOCK_GROUP_DATA) { + int sector_nr; + unsigned long csum_bitmap = 0; + + /* Csum space should have already been allocated. */ + ASSERT(stripe->csums); + + /* + * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN + * should contain at most 16 sectors. 
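Illustrative aside (not part of the patch): the assertion that follows relies on a 64 KiB stripe never holding more sectors than one unsigned long has bits, and on each sector's csum sitting at a fixed offset inside stripe->csums. A small arithmetic sketch, assuming 4-byte crc32c checksums:

#include <stdio.h>

int main(void)
{
	const unsigned int stripe_len = 64 * 1024;	/* BTRFS_STRIPE_LEN */
	const unsigned int csum_size = 4;		/* e.g. crc32c */

	/* 4K sectors -> 16 bits, 64K sectors -> 1 bit; all fit in one long. */
	for (unsigned int sectorsize = 4096; sectorsize <= 64 * 1024; sectorsize *= 2)
		printf("sectorsize %6u -> %2u sectors per stripe\n",
		       sectorsize, stripe_len / sectorsize);

	/* The csum of sector N lives at stripe->csums + N * csum_size. */
	printf("sector 5 csum offset: %u bytes\n", 5 * csum_size);
	return 0;
}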
+ */ + ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); + + ret = btrfs_lookup_csums_bitmap(csum_root, stripe->logical, + stripe_end, stripe->csums, + &csum_bitmap, true); if (ret < 0) - break; - get_extent_info(path, &extent_start, &extent_size, &extent_flags, - &extent_gen); + goto out; + if (ret > 0) + ret = 0; + + for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) { + stripe->sectors[sector_nr].csum = stripe->csums + + sector_nr * fs_info->csum_size; + } + } + set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); +out: + btrfs_release_path(&path); + return ret; +} + +static void scrub_reset_stripe(struct scrub_stripe *stripe) +{ + scrub_stripe_reset_bitmaps(stripe); + + stripe->nr_meta_extents = 0; + stripe->nr_data_extents = 0; + stripe->state = 0; + + for (int i = 0; i < stripe->nr_sectors; i++) { + stripe->sectors[i].is_metadata = false; + stripe->sectors[i].csum = NULL; + stripe->sectors[i].generation = 0; + } +} + +static void scrub_submit_initial_read(struct scrub_ctx *sctx, + struct scrub_stripe *stripe) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_bio *bbio; + int mirror = stripe->mirror_num; + + ASSERT(stripe->bg); + ASSERT(stripe->mirror_num > 0); + ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); + + bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, + scrub_read_endio, stripe); + + /* Read the whole stripe. */ + bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; + for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) { + int ret; + + ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0); + /* We should have allocated enough bio vectors. */ + ASSERT(ret == PAGE_SIZE); + } + atomic_inc(&stripe->pending_io); + + /* + * For dev-replace, either user asks to avoid the source dev, or + * the device is missing, we try the next mirror instead. + */ + if (sctx->is_dev_replace && + (fs_info->dev_replace.cont_reading_from_srcdev_mode == + BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID || + !stripe->dev->bdev)) { + int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, + stripe->bg->length); + + mirror = calc_next_mirror(mirror, num_copies); + } + btrfs_submit_bio(bbio, mirror); +} + +static bool stripe_has_metadata_error(struct scrub_stripe *stripe) +{ + int i; + + for_each_set_bit(i, &stripe->error_bitmap, stripe->nr_sectors) { + if (stripe->sectors[i].is_metadata) { + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; - /* Metadata should not cross stripe boundaries */ - if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && - does_range_cross_boundary(extent_start, extent_size, - logical, map->stripe_len)) { btrfs_err(fs_info, - "scrub: tree block %llu spanning stripes, ignored. 
logical=%llu", - extent_start, logical); - spin_lock(&sctx->stat_lock); - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - cur_logical += extent_size; - continue; + "stripe %llu has unrepaired metadata sector at %llu", + stripe->logical, + stripe->logical + (i << fs_info->sectorsize_bits)); + return true; } + } + return false; +} - /* Skip hole range which doesn't have any extent */ - cur_logical = max(extent_start, cur_logical); +static int flush_scrub_stripes(struct scrub_ctx *sctx) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct scrub_stripe *stripe; + const int nr_stripes = sctx->cur_stripe; + int ret = 0; - /* Truncate the range inside this data stripe */ - extent_size = min(extent_start + extent_size, - logical + map->stripe_len) - cur_logical; - extent_start = cur_logical; - ASSERT(extent_size <= U32_MAX); + if (!nr_stripes) + return 0; - scrub_parity_mark_sectors_data(sparity, extent_start, extent_size); + ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state)); - mapped_length = extent_size; - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start, - &mapped_length, &bioc, 0); - if (!ret && (!bioc || mapped_length < extent_size)) - ret = -EIO; - if (ret) { - btrfs_put_bioc(bioc); - scrub_parity_mark_sectors_error(sparity, extent_start, - extent_size); - break; + scrub_throttle_dev_io(sctx, sctx->stripes[0].dev, + nr_stripes << BTRFS_STRIPE_LEN_SHIFT); + for (int i = 0; i < nr_stripes; i++) { + stripe = &sctx->stripes[i]; + scrub_submit_initial_read(sctx, stripe); + } + + for (int i = 0; i < nr_stripes; i++) { + stripe = &sctx->stripes[i]; + + wait_event(stripe->repair_wait, + test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); + } + + /* + * Submit the repaired sectors. For zoned case, we cannot do repair + * in-place, but queue the bg to be relocated. + */ + if (btrfs_is_zoned(fs_info)) { + for (int i = 0; i < nr_stripes; i++) { + stripe = &sctx->stripes[i]; + + if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) { + btrfs_repair_one_zone(fs_info, + sctx->stripes[0].bg->start); + break; + } } - extent_physical = bioc->stripes[0].physical; - extent_mirror_num = bioc->mirror_num; - extent_dev = bioc->stripes[0].dev; - btrfs_put_bioc(bioc); + } else { + for (int i = 0; i < nr_stripes; i++) { + unsigned long repaired; - ret = btrfs_lookup_csums_list(csum_root, extent_start, - extent_start + extent_size - 1, - &sctx->csum_list, 1, false); - if (ret) { - scrub_parity_mark_sectors_error(sparity, extent_start, - extent_size); - break; + stripe = &sctx->stripes[i]; + + bitmap_andnot(&repaired, &stripe->init_error_bitmap, + &stripe->error_bitmap, stripe->nr_sectors); + scrub_write_sectors(sctx, stripe, repaired, false); + } + } + + /* Submit for dev-replace. */ + if (sctx->is_dev_replace) { + /* + * For dev-replace, if we know there is something wrong with + * metadata, we should immedately abort. 
+ */ + for (int i = 0; i < nr_stripes; i++) { + if (stripe_has_metadata_error(&sctx->stripes[i])) { + ret = -EIO; + goto out; + } } + for (int i = 0; i < nr_stripes; i++) { + unsigned long good; - ret = scrub_extent_for_parity(sparity, extent_start, - extent_size, extent_physical, - extent_dev, extent_flags, - extent_gen, extent_mirror_num); - scrub_free_csums(sctx); + stripe = &sctx->stripes[i]; - if (ret) { - scrub_parity_mark_sectors_error(sparity, extent_start, - extent_size); - break; + ASSERT(stripe->dev == fs_info->dev_replace.srcdev); + + bitmap_andnot(&good, &stripe->extent_sector_bitmap, + &stripe->error_bitmap, stripe->nr_sectors); + scrub_write_sectors(sctx, stripe, good, true); } + } - cond_resched(); - cur_logical += extent_size; + /* Wait for the above writebacks to finish. */ + for (int i = 0; i < nr_stripes; i++) { + stripe = &sctx->stripes[i]; + + wait_scrub_stripe_io(stripe); + scrub_reset_stripe(stripe); } - btrfs_release_path(path); +out: + sctx->cur_stripe = 0; return ret; } -static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, - struct map_lookup *map, - struct btrfs_device *sdev, - u64 logic_start, - u64 logic_end) +static void raid56_scrub_wait_endio(struct bio *bio) { - struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_path *path; - u64 cur_logical; + complete(bio->bi_private); +} + +static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, + struct btrfs_device *dev, int mirror_num, + u64 logical, u32 length, u64 physical) +{ + struct scrub_stripe *stripe; int ret; - struct scrub_parity *sparity; - int nsectors; - path = btrfs_alloc_path(); - if (!path) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); - return -ENOMEM; + /* No available slot, submit all stripes and wait for them. */ + if (sctx->cur_stripe >= SCRUB_STRIPES_PER_SCTX) { + ret = flush_scrub_stripes(sctx); + if (ret < 0) + return ret; } - path->search_commit_root = 1; - path->skip_locking = 1; - ASSERT(map->stripe_len <= U32_MAX); - nsectors = map->stripe_len >> fs_info->sectorsize_bits; - ASSERT(nsectors <= BITS_PER_LONG); - sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS); - if (!sparity) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); - btrfs_free_path(path); - return -ENOMEM; - } + stripe = &sctx->stripes[sctx->cur_stripe]; - ASSERT(map->stripe_len <= U32_MAX); - sparity->stripe_len = map->stripe_len; - sparity->nsectors = nsectors; - sparity->sctx = sctx; - sparity->scrub_dev = sdev; - sparity->logic_start = logic_start; - sparity->logic_end = logic_end; - refcount_set(&sparity->refs, 1); - INIT_LIST_HEAD(&sparity->sectors_list); + /* We can queue one stripe using the remaining slot. */ + scrub_reset_stripe(stripe); + ret = scrub_find_fill_first_stripe(bg, dev, physical, mirror_num, + logical, length, stripe); + /* Either >0 as no more extents or <0 for error. 
*/ + if (ret) + return ret; + sctx->cur_stripe++; + return 0; +} - ret = 0; - for (cur_logical = logic_start; cur_logical < logic_end; - cur_logical += map->stripe_len) { - ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map, - sdev, path, cur_logical); +static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev, + struct btrfs_block_group *bg, + struct map_lookup *map, + u64 full_stripe_start) +{ + DECLARE_COMPLETION_ONSTACK(io_done); + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_raid_bio *rbio; + struct btrfs_io_context *bioc = NULL; + struct bio *bio; + struct scrub_stripe *stripe; + bool all_empty = true; + const int data_stripes = nr_data_stripes(map); + unsigned long extent_bitmap = 0; + u64 length = data_stripes << BTRFS_STRIPE_LEN_SHIFT; + int ret; + + ASSERT(sctx->raid56_data_stripes); + + for (int i = 0; i < data_stripes; i++) { + int stripe_index; + int rot; + u64 physical; + + stripe = &sctx->raid56_data_stripes[i]; + rot = div_u64(full_stripe_start - bg->start, + data_stripes) >> BTRFS_STRIPE_LEN_SHIFT; + stripe_index = (i + rot) % map->num_stripes; + physical = map->stripes[stripe_index].physical + + (rot << BTRFS_STRIPE_LEN_SHIFT); + + scrub_reset_stripe(stripe); + set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state); + ret = scrub_find_fill_first_stripe(bg, + map->stripes[stripe_index].dev, physical, 1, + full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT), + BTRFS_STRIPE_LEN, stripe); if (ret < 0) + goto out; + /* + * No extent in this data stripe, need to manually mark them + * initialized to make later read submission happy. + */ + if (ret > 0) { + stripe->logical = full_stripe_start + + (i << BTRFS_STRIPE_LEN_SHIFT); + stripe->dev = map->stripes[stripe_index].dev; + stripe->mirror_num = 1; + set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); + } + } + + /* Check if all data stripes are empty. */ + for (int i = 0; i < data_stripes; i++) { + stripe = &sctx->raid56_data_stripes[i]; + if (!bitmap_empty(&stripe->extent_sector_bitmap, stripe->nr_sectors)) { + all_empty = false; break; + } + } + if (all_empty) { + ret = 0; + goto out; } - scrub_parity_put(sparity); - scrub_submit(sctx); - mutex_lock(&sctx->wr_lock); - scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_lock); + for (int i = 0; i < data_stripes; i++) { + stripe = &sctx->raid56_data_stripes[i]; + scrub_submit_initial_read(sctx, stripe); + } + for (int i = 0; i < data_stripes; i++) { + stripe = &sctx->raid56_data_stripes[i]; - btrfs_free_path(path); - return ret < 0 ? ret : 0; -} + wait_event(stripe->repair_wait, + test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); + } + /* For now, no zoned support for RAID56. */ + ASSERT(!btrfs_is_zoned(sctx->fs_info)); -static void sync_replace_for_zoned(struct scrub_ctx *sctx) -{ - if (!btrfs_is_zoned(sctx->fs_info)) - return; + /* Writeback for the repaired sectors. 
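Illustrative aside (not part of the patch): the rotation arithmetic above decides, for every data stripe of a RAID56 full stripe, which device stripe it lives on and at what offset into that device stripe. A standalone sketch with a hypothetical 3-device RAID5 layout:

#include <stdio.h>

#define STRIPE_LEN_SHIFT	16	/* BTRFS_STRIPE_LEN_SHIFT, 64 KiB stripes */

int main(void)
{
	/* Hypothetical 3-device RAID5: 2 data stripes + 1 parity per full stripe. */
	const int num_stripes = 3;
	const int data_stripes = 2;
	const unsigned long long bg_start = 0;
	/* Second full stripe of the block group. */
	const unsigned long long full_stripe_start = bg_start +
			((unsigned long long)data_stripes << STRIPE_LEN_SHIFT);

	const int rot = (int)(((full_stripe_start - bg_start) / data_stripes) >>
			      STRIPE_LEN_SHIFT);

	for (int i = 0; i < data_stripes; i++) {
		int stripe_index = (i + rot) % num_stripes;

		printf("data stripe %d -> device stripe %d, offset +%llu bytes\n",
		       i, stripe_index,
		       (unsigned long long)rot << STRIPE_LEN_SHIFT);
	}
	return 0;
}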
*/ + for (int i = 0; i < data_stripes; i++) { + unsigned long repaired; - sctx->flush_all_writes = true; - scrub_submit(sctx); - mutex_lock(&sctx->wr_lock); - scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_lock); + stripe = &sctx->raid56_data_stripes[i]; - wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); -} + bitmap_andnot(&repaired, &stripe->init_error_bitmap, + &stripe->error_bitmap, stripe->nr_sectors); + scrub_write_sectors(sctx, stripe, repaired, false); + } -static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, - u64 physical, u64 physical_end) -{ - struct btrfs_fs_info *fs_info = sctx->fs_info; - int ret = 0; + /* Wait for the above writebacks to finish. */ + for (int i = 0; i < data_stripes; i++) { + stripe = &sctx->raid56_data_stripes[i]; - if (!btrfs_is_zoned(fs_info)) - return 0; + wait_scrub_stripe_io(stripe); + } - wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); + /* + * Now all data stripes are properly verified. Check if we have any + * unrepaired, if so abort immediately or we could further corrupt the + * P/Q stripes. + * + * During the loop, also populate extent_bitmap. + */ + for (int i = 0; i < data_stripes; i++) { + unsigned long error; - mutex_lock(&sctx->wr_lock); - if (sctx->write_pointer < physical_end) { - ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical, - physical, - sctx->write_pointer); - if (ret) + stripe = &sctx->raid56_data_stripes[i]; + + /* + * We should only check the errors where there is an extent. + * As we may hit an empty data stripe while it's missing. + */ + bitmap_and(&error, &stripe->error_bitmap, + &stripe->extent_sector_bitmap, stripe->nr_sectors); + if (!bitmap_empty(&error, stripe->nr_sectors)) { btrfs_err(fs_info, - "zoned: failed to recover write pointer"); +"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", + full_stripe_start, i, stripe->nr_sectors, + &error); + ret = -EIO; + goto out; + } + bitmap_or(&extent_bitmap, &extent_bitmap, + &stripe->extent_sector_bitmap, stripe->nr_sectors); } - mutex_unlock(&sctx->wr_lock); - btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); + /* Now we can check and regenerate the P/Q stripe. */ + bio = bio_alloc(NULL, 1, REQ_OP_READ, GFP_NOFS); + bio->bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT; + bio->bi_private = &io_done; + bio->bi_end_io = raid56_scrub_wait_endio; + + btrfs_bio_counter_inc_blocked(fs_info); + ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, full_stripe_start, + &length, &bioc); + if (ret < 0) { + btrfs_put_bioc(bioc); + btrfs_bio_counter_dec(fs_info); + goto out; + } + rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, scrub_dev, &extent_bitmap, + BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); + btrfs_put_bioc(bioc); + if (!rbio) { + ret = -ENOMEM; + btrfs_bio_counter_dec(fs_info); + goto out; + } + raid56_parity_submit_scrub_rbio(rbio); + wait_for_completion_io(&io_done); + ret = blk_status_to_errno(bio->bi_status); + bio_put(bio); + btrfs_bio_counter_dec(fs_info); + +out: return ret; } @@ -3410,8 +1963,6 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, * and @logical_length parameter. 
*/ static int scrub_simple_mirror(struct scrub_ctx *sctx, - struct btrfs_root *extent_root, - struct btrfs_root *csum_root, struct btrfs_block_group *bg, struct map_lookup *map, u64 logical_start, u64 logical_length, @@ -3421,7 +1972,6 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, struct btrfs_fs_info *fs_info = sctx->fs_info; const u64 logical_end = logical_start + logical_length; /* An artificial limit, inherit from old scrub behavior */ - const u32 max_length = SZ_64K; struct btrfs_path path = { 0 }; u64 cur_logical = logical_start; int ret; @@ -3433,11 +1983,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, path.skip_locking = 1; /* Go through each extent items inside the logical range */ while (cur_logical < logical_end) { - u64 extent_start; - u64 extent_len; - u64 extent_flags; - u64 extent_gen; - u64 scrub_len; + u64 cur_physical = physical + cur_logical - logical_start; /* Canceled? */ if (atomic_read(&fs_info->scrub_cancel_req) || @@ -3448,14 +1994,6 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, /* Paused? */ if (atomic_read(&fs_info->scrub_pause_req)) { /* Push queued extents */ - sctx->flush_all_writes = true; - scrub_submit(sctx); - mutex_lock(&sctx->wr_lock); - scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_lock); - wait_event(sctx->list_wait, - atomic_read(&sctx->bios_in_flight) == 0); - sctx->flush_all_writes = false; scrub_blocked_if_needed(fs_info); } /* Block group removed? */ @@ -3467,8 +2005,9 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, } spin_unlock(&bg->lock); - ret = find_first_extent_item(extent_root, &path, cur_logical, - logical_end - cur_logical); + ret = queue_scrub_stripe(sctx, bg, device, mirror_num, + cur_logical, logical_end - cur_logical, + cur_physical); if (ret > 0) { /* No more extent, just update the accounting */ sctx->stat.last_physical = physical + logical_length; @@ -3477,52 +2016,11 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, } if (ret < 0) break; - get_extent_info(&path, &extent_start, &extent_len, - &extent_flags, &extent_gen); - /* Skip hole range which doesn't have any extent */ - cur_logical = max(extent_start, cur_logical); - /* - * Scrub len has three limits: - * - Extent size limit - * - Scrub range limit - * This is especially imporatant for RAID0/RAID10 to reuse - * this function - * - Max scrub size limit - */ - scrub_len = min(min(extent_start + extent_len, - logical_end), cur_logical + max_length) - - cur_logical; - - if (extent_flags & BTRFS_EXTENT_FLAG_DATA) { - ret = btrfs_lookup_csums_list(csum_root, cur_logical, - cur_logical + scrub_len - 1, - &sctx->csum_list, 1, false); - if (ret) - break; - } - if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && - does_range_cross_boundary(extent_start, extent_len, - logical_start, logical_length)) { - btrfs_err(fs_info, -"scrub: tree block %llu spanning boundaries, ignored. 
boundary=[%llu, %llu)", - extent_start, logical_start, logical_end); - spin_lock(&sctx->stat_lock); - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - cur_logical += scrub_len; - continue; - } - ret = scrub_extent(sctx, map, cur_logical, scrub_len, - cur_logical - logical_start + physical, - device, extent_flags, extent_gen, - mirror_num); - scrub_free_csums(sctx); - if (ret) - break; - if (sctx->is_dev_replace) - sync_replace_for_zoned(sctx); - cur_logical += scrub_len; + ASSERT(sctx->cur_stripe > 0); + cur_logical = sctx->stripes[sctx->cur_stripe - 1].logical + + BTRFS_STRIPE_LEN; + /* Don't hold CPU for too long time */ cond_resched(); } @@ -3536,7 +2034,7 @@ static u64 simple_stripe_full_stripe_len(const struct map_lookup *map) ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)); - return map->num_stripes / map->sub_stripes * map->stripe_len; + return (map->num_stripes / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT; } /* Get the logical bytenr for the stripe */ @@ -3552,7 +2050,8 @@ static u64 simple_stripe_get_logical(struct map_lookup *map, * (stripe_index / sub_stripes) gives how many data stripes we need to * skip. */ - return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start; + return ((stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT) + + bg->start; } /* Get the mirror number for the stripe */ @@ -3567,8 +2066,6 @@ static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index) } static int scrub_simple_stripe(struct scrub_ctx *sctx, - struct btrfs_root *extent_root, - struct btrfs_root *csum_root, struct btrfs_block_group *bg, struct map_lookup *map, struct btrfs_device *device, @@ -3588,15 +2085,15 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx, * just RAID1, so we can reuse scrub_simple_mirror() to scrub * this stripe. */ - ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map, - cur_logical, map->stripe_len, device, - cur_physical, mirror_num); + ret = scrub_simple_mirror(sctx, bg, map, cur_logical, + BTRFS_STRIPE_LEN, device, cur_physical, + mirror_num); if (ret) return ret; /* Skip to next stripe which belongs to the target device */ cur_logical += logical_increment; /* For physical offset, we just go to next stripe */ - cur_physical += map->stripe_len; + cur_physical += BTRFS_STRIPE_LEN; } return ret; } @@ -3607,15 +2104,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, int stripe_index) { - struct btrfs_path *path; struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_root *root; - struct btrfs_root *csum_root; - struct blk_plug plug; struct map_lookup *map = em->map_lookup; const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; const u64 chunk_logical = bg->start; int ret; + int ret2; u64 physical = map->stripes[stripe_index].physical; const u64 dev_stripe_len = btrfs_calc_stripe_length(em); const u64 physical_end = physical + dev_stripe_len; @@ -3626,43 +2120,37 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, /* Offset inside the chunk */ u64 offset; u64 stripe_logical; - u64 stripe_end; int stop_loop = 0; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - /* - * work on commit root. The related disk blocks are static as - * long as COW is applied. 
This means, it is save to rewrite - * them to repair disk errors without any race conditions - */ - path->search_commit_root = 1; - path->skip_locking = 1; - path->reada = READA_FORWARD; - - wait_event(sctx->list_wait, - atomic_read(&sctx->bios_in_flight) == 0); scrub_blocked_if_needed(fs_info); - root = btrfs_extent_root(fs_info, bg->start); - csum_root = btrfs_csum_root(fs_info, bg->start); - - /* - * collect all data csums for the stripe to avoid seeking during - * the scrub. This might currently (crc32) end up to be about 1MB - */ - blk_start_plug(&plug); - if (sctx->is_dev_replace && btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) { mutex_lock(&sctx->wr_lock); sctx->write_pointer = physical; mutex_unlock(&sctx->wr_lock); - sctx->flush_all_writes = true; } + /* Prepare the extra data stripes used by RAID56. */ + if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ASSERT(sctx->raid56_data_stripes == NULL); + + sctx->raid56_data_stripes = kcalloc(nr_data_stripes(map), + sizeof(struct scrub_stripe), + GFP_KERNEL); + if (!sctx->raid56_data_stripes) { + ret = -ENOMEM; + goto out; + } + for (int i = 0; i < nr_data_stripes(map); i++) { + ret = init_scrub_stripe(fs_info, + &sctx->raid56_data_stripes[i]); + if (ret < 0) + goto out; + sctx->raid56_data_stripes[i].bg = bg; + sctx->raid56_data_stripes[i].sctx = sctx; + } + } /* * There used to be a big double loop to handle all profiles using the * same routine, which grows larger and more gross over time. @@ -3680,17 +2168,15 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * Only @physical and @mirror_num needs to calculated using * @stripe_index. */ - ret = scrub_simple_mirror(sctx, root, csum_root, bg, map, - bg->start, bg->length, scrub_dev, - map->stripes[stripe_index].physical, + ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length, + scrub_dev, map->stripes[stripe_index].physical, stripe_index + 1); offset = 0; goto out; } if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - ret = scrub_simple_stripe(sctx, root, csum_root, bg, map, - scrub_dev, stripe_index); - offset = map->stripe_len * (stripe_index / map->sub_stripes); + ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index); + offset = (stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT; goto out; } @@ -3705,7 +2191,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, /* Initialize @offset in case we need to go to out: label */ get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL); - increment = map->stripe_len * nr_data_stripes(map); + increment = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT; /* * Due to the rotation, for RAID56 it's better to iterate each stripe @@ -3718,10 +2204,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (ret) { /* it is parity strip */ stripe_logical += chunk_logical; - stripe_end = stripe_logical + increment; - ret = scrub_raid56_parity(sctx, map, scrub_dev, - stripe_logical, - stripe_end); + ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg, + map, stripe_logical); if (ret) goto out; goto next; @@ -3735,14 +2219,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * We can reuse scrub_simple_mirror() here, as the repair part * is still based on @mirror_num. 
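Most of the conversions in these scrub hunks replace multiplications and divisions by map->stripe_len with shifts by BTRFS_STRIPE_LEN_SHIFT and masks by BTRFS_STRIPE_LEN_MASK, which is only valid because the stripe length is a fixed power of two (64K, as the static_assert in volumes.h enforces). The equivalences being relied on can be checked in a few lines; the constants below are mirrored locally for the sketch rather than pulled from kernel headers.

#include <assert.h>
#include <stdint.h>

#define STRIPE_LEN       (64 * 1024ULL)        /* BTRFS_STRIPE_LEN */
#define STRIPE_LEN_SHIFT 16                    /* BTRFS_STRIPE_LEN_SHIFT */
#define STRIPE_LEN_MASK  (STRIPE_LEN - 1)      /* BTRFS_STRIPE_LEN_MASK */

int main(void)
{
        uint64_t offset = 5 * STRIPE_LEN + 12345;      /* arbitrary test offset */
        uint32_t nr = 7;                               /* arbitrary stripe count */

        /* offset / stripe_len  ==  offset >> shift */
        assert(offset / STRIPE_LEN == offset >> STRIPE_LEN_SHIFT);
        /* offset % stripe_len  ==  offset & mask */
        assert(offset % STRIPE_LEN == (offset & STRIPE_LEN_MASK));
        /* nr * stripe_len      ==  nr << shift (widened to u64 first) */
        assert(nr * STRIPE_LEN == ((uint64_t)nr << STRIPE_LEN_SHIFT));
        return 0;
}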
*/ - ret = scrub_simple_mirror(sctx, root, csum_root, bg, map, - logical, map->stripe_len, + ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN, scrub_dev, physical, 1); if (ret < 0) goto out; next: logical += increment; - physical += map->stripe_len; + physical += BTRFS_STRIPE_LEN; spin_lock(&sctx->stat_lock); if (stop_loop) sctx->stat.last_physical = @@ -3754,14 +2237,15 @@ next: break; } out: - /* push queued extents */ - scrub_submit(sctx); - mutex_lock(&sctx->wr_lock); - scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_lock); - - blk_finish_plug(&plug); - btrfs_free_path(path); + ret2 = flush_scrub_stripes(sctx); + if (!ret) + ret = ret2; + if (sctx->raid56_data_stripes) { + for (int i = 0; i < nr_data_stripes(map); i++) + release_scrub_stripe(&sctx->raid56_data_stripes[i]); + kfree(sctx->raid56_data_stripes); + sctx->raid56_data_stripes = NULL; + } if (sctx->is_dev_replace && ret >= 0) { int ret2; @@ -4079,39 +2563,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset, dev_extent_len); - - /* - * flush, submit all pending read and write bios, afterwards - * wait for them. - * Note that in the dev replace case, a read request causes - * write requests that are submitted in the read completion - * worker. Therefore in the current situation, it is required - * that all write requests are flushed, so that all read and - * write requests are really completed when bios_in_flight - * changes to 0. - */ - sctx->flush_all_writes = true; - scrub_submit(sctx); - mutex_lock(&sctx->wr_lock); - scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_lock); - - wait_event(sctx->list_wait, - atomic_read(&sctx->bios_in_flight) == 0); - - scrub_pause_on(fs_info); - - /* - * must be called before we decrease @scrub_paused. - * make sure we don't block transaction commit while - * we are waiting pending workers finished.
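Earlier in scrub_stripe(), the RAID56 path allocates raid56_data_stripes with kcalloc(), initializes each entry in a loop, and the out: path above releases every entry and frees the array. The standalone sketch below shows the same allocate, init-each, release-all shape; demo_init() and demo_release() are invented stand-ins, not the btrfs helpers.

#include <stdlib.h>

struct demo_stripe {
        void *buf;      /* stands in for the pages/bitmaps a real scrub stripe holds */
};

static int demo_init(struct demo_stripe *s)
{
        s->buf = malloc(4096);
        return s->buf ? 0 : -1;
}

static void demo_release(struct demo_stripe *s)
{
        free(s->buf);   /* free(NULL) is a no-op, so partial init is fine here */
        s->buf = NULL;
}

static int demo_run(int nr_stripes)
{
        struct demo_stripe *stripes;
        int ret = 0;

        /* calloc() zeroes the array, so releasing untouched entries is safe. */
        stripes = calloc(nr_stripes, sizeof(*stripes));
        if (!stripes)
                return -1;

        for (int i = 0; i < nr_stripes; i++) {
                ret = demo_init(&stripes[i]);
                if (ret < 0)
                        goto out;       /* teardown below copes with partial init */
        }

        /* ... use the stripes ... */

out:
        for (int i = 0; i < nr_stripes; i++)
                demo_release(&stripes[i]);
        free(stripes);
        return ret;
}

int main(void)
{
        return demo_run(3) ? 1 : 0;
}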
- */ - wait_event(sctx->list_wait, - atomic_read(&sctx->workers_pending) == 0); - sctx->flush_all_writes = false; - - scrub_pause_off(fs_info); - if (sctx->is_dev_replace && !btrfs_finish_block_group_to_copy(dev_replace->srcdev, cache, found_key.offset)) @@ -4168,18 +2619,62 @@ skip: return ret; } +static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, + struct page *page, u64 physical, u64 generation) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct bio_vec bvec; + struct bio bio; + struct btrfs_super_block *sb = page_address(page); + int ret; + + bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ); + bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT; + __bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0); + ret = submit_bio_wait(&bio); + bio_uninit(&bio); + + if (ret < 0) + return ret; + ret = btrfs_check_super_csum(fs_info, sb); + if (ret != 0) { + btrfs_err_rl(fs_info, + "super block at physical %llu devid %llu has bad csum", + physical, dev->devid); + return -EIO; + } + if (btrfs_super_generation(sb) != generation) { + btrfs_err_rl(fs_info, +"super block at physical %llu devid %llu has bad generation %llu expect %llu", + physical, dev->devid, + btrfs_super_generation(sb), generation); + return -EUCLEAN; + } + + return btrfs_validate_super(fs_info, sb, -1); +} + static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev) { int i; u64 bytenr; u64 gen; - int ret; + int ret = 0; + struct page *page; struct btrfs_fs_info *fs_info = sctx->fs_info; if (BTRFS_FS_ERROR(fs_info)) return -EROFS; + page = alloc_page(GFP_KERNEL); + if (!page) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + /* Seed devices of a new filesystem has their own generation. 
*/ if (scrub_dev->fs_devices != fs_info->fs_devices) gen = scrub_dev->generation; @@ -4194,14 +2689,14 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, if (!btrfs_check_super_location(scrub_dev, bytenr)) continue; - ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, - scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, - NULL, bytenr); - if (ret) - return ret; + ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen); + if (ret) { + spin_lock(&sctx->stat_lock); + sctx->stat.super_errors++; + spin_unlock(&sctx->stat_lock); + } } - wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); - + __free_page(page); return 0; } @@ -4212,20 +2707,15 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info) struct workqueue_struct *scrub_workers = fs_info->scrub_workers; struct workqueue_struct *scrub_wr_comp = fs_info->scrub_wr_completion_workers; - struct workqueue_struct *scrub_parity = - fs_info->scrub_parity_workers; fs_info->scrub_workers = NULL; fs_info->scrub_wr_completion_workers = NULL; - fs_info->scrub_parity_workers = NULL; mutex_unlock(&fs_info->scrub_lock); if (scrub_workers) destroy_workqueue(scrub_workers); if (scrub_wr_comp) destroy_workqueue(scrub_wr_comp); - if (scrub_parity) - destroy_workqueue(scrub_parity); } } @@ -4237,7 +2727,6 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, { struct workqueue_struct *scrub_workers = NULL; struct workqueue_struct *scrub_wr_comp = NULL; - struct workqueue_struct *scrub_parity = NULL; unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; int ret = -ENOMEM; @@ -4254,18 +2743,12 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (!scrub_wr_comp) goto fail_scrub_wr_completion_workers; - scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active); - if (!scrub_parity) - goto fail_scrub_parity_workers; - mutex_lock(&fs_info->scrub_lock); if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { ASSERT(fs_info->scrub_workers == NULL && - fs_info->scrub_wr_completion_workers == NULL && - fs_info->scrub_parity_workers == NULL); + fs_info->scrub_wr_completion_workers == NULL); fs_info->scrub_workers = scrub_workers; fs_info->scrub_wr_completion_workers = scrub_wr_comp; - fs_info->scrub_parity_workers = scrub_parity; refcount_set(&fs_info->scrub_workers_refcnt, 1); mutex_unlock(&fs_info->scrub_lock); return 0; @@ -4275,8 +2758,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->scrub_lock); ret = 0; - destroy_workqueue(scrub_parity); -fail_scrub_parity_workers: + destroy_workqueue(scrub_wr_comp); fail_scrub_wr_completion_workers: destroy_workqueue(scrub_workers); @@ -4411,12 +2893,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, ret = scrub_enumerate_chunks(sctx, dev, start, end); memalloc_nofs_restore(nofs_flag); - wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); atomic_dec(&fs_info->scrubs_running); wake_up(&fs_info->scrub_pause_wait); - wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); - if (progress) memcpy(progress, &sctx->stat, sizeof(*progress)); @@ -4541,28 +3020,3 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, return dev ? (sctx ? 
0 : -ENOTCONN) : -ENODEV; } - -static void scrub_find_good_copy(struct btrfs_fs_info *fs_info, - u64 extent_logical, u32 extent_len, - u64 *extent_physical, - struct btrfs_device **extent_dev, - int *extent_mirror_num) -{ - u64 mapped_length; - struct btrfs_io_context *bioc = NULL; - int ret; - - mapped_length = extent_len; - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical, - &mapped_length, &bioc, 0); - if (ret || !bioc || mapped_length < extent_len || - !bioc->stripes[0].dev->bdev) { - btrfs_put_bioc(bioc); - return; - } - - *extent_physical = bioc->stripes[0].physical; - *extent_mirror_num = bioc->mirror_num; - *extent_dev = bioc->stripes[0].dev; - btrfs_put_bioc(bioc); -} diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e5c963bb873d..af2e153543a5 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1875,7 +1875,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, int left_ret; int right_ret; u64 left_gen; - u64 right_gen; + u64 right_gen = 0; struct btrfs_inode_info info; ret = get_inode_info(sctx->send_root, ino, &info); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 3eecce86f63f..75e7fa337e66 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -537,7 +537,7 @@ again: up_read(&info->groups_sem); } -static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, +static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info, u64 to_reclaim) { u64 bytes; @@ -550,6 +550,18 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, return nr; } +static inline u64 calc_delayed_refs_nr(const struct btrfs_fs_info *fs_info, + u64 to_reclaim) +{ + const u64 bytes = btrfs_calc_delayed_ref_bytes(fs_info, 1); + u64 nr; + + nr = div64_u64(to_reclaim, bytes); + if (!nr) + nr = 1; + return nr; +} + #define EXTENT_SIZE_PER_ITEM SZ_256K /* @@ -727,7 +739,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; } if (state == FLUSH_DELAYED_REFS_NR) - nr = calc_reclaim_items_nr(fs_info, num_bytes); + nr = calc_delayed_refs_nr(fs_info, num_bytes); else nr = 0; btrfs_run_delayed_refs(trans, nr); @@ -1599,11 +1611,22 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, struct reserve_ticket ticket; u64 start_ns = 0; u64 used; - int ret = 0; + int ret = -ENOSPC; bool pending_tickets; ASSERT(orig_bytes); - ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); + /* + * If have a transaction handle (current->journal_info != NULL), then + * the flush method can not be neither BTRFS_RESERVE_FLUSH_ALL* nor + * BTRFS_RESERVE_FLUSH_EVICT, as we could deadlock because those + * flushing methods can trigger transaction commits. + */ + if (current->journal_info) { + /* One assert per line for easier debugging. 
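Earlier in this space-info.c hunk, flush_space() switches FLUSH_DELAYED_REFS_NR over to the new calc_delayed_refs_nr() helper, so the number of refs to run is derived from the per-ref reservation size returned by btrfs_calc_delayed_ref_bytes(). Its rounding rule (plain integer division, but never return zero) is easy to model standalone; the per-ref byte cost below is an invented placeholder, not the real helper's value.

#include <assert.h>
#include <stdint.h>

/* Placeholder for btrfs_calc_delayed_ref_bytes(fs_info, 1); value is invented. */
#define DEMO_DELAYED_REF_BYTES  (256ULL * 1024)

static uint64_t calc_delayed_refs_nr_demo(uint64_t to_reclaim)
{
        uint64_t nr = to_reclaim / DEMO_DELAYED_REF_BYTES;

        /* Always run at least one delayed ref, even for tiny reclaim targets. */
        if (!nr)
                nr = 1;
        return nr;
}

int main(void)
{
        assert(calc_delayed_refs_nr_demo(0) == 1);
        assert(calc_delayed_refs_nr_demo(1) == 1);
        assert(calc_delayed_refs_nr_demo(10 * DEMO_DELAYED_REF_BYTES) == 10);
        return 0;
}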
*/ + ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL); + ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL); + ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT); + } if (flush == BTRFS_RESERVE_FLUSH_DATA) async_work = &fs_info->async_data_reclaim_work; @@ -1611,7 +1634,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, async_work = &fs_info->async_reclaim_work; spin_lock(&space_info->lock); - ret = -ENOSPC; used = btrfs_space_info_used(space_info, true); /* diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 2033b71b18ce..0bb9d14e60a8 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -27,6 +27,7 @@ enum btrfs_reserve_flush_enum { * - Running delayed refs * - Running delalloc and waiting for ordered extents * - Allocating a new chunk + * - Committing transaction */ BTRFS_RESERVE_FLUSH_EVICT, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 366fb4cde145..6cb97efee976 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1158,6 +1158,7 @@ static int btrfs_fill_super(struct super_block *sb, inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root); if (IS_ERR(inode)) { err = PTR_ERR(inode); + btrfs_handle_fs_error(fs_info, err, NULL); goto fail_close; } @@ -2412,7 +2413,7 @@ static int __init btrfs_print_mod_info(void) ", fsverity=no" #endif ; - pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options); + pr_info("Btrfs loaded%s\n", options); return 0; } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 37fc58a7f27e..25294e624851 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1262,8 +1262,13 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, if (ret) return ret; +#ifdef CONFIG_BTRFS_DEBUG + if (thresh != 0 && (thresh > 100)) + return -EINVAL; +#else if (thresh != 0 && (thresh <= 50 || thresh > 100)) return -EINVAL; +#endif WRITE_ONCE(fs_info->bg_reclaim_threshold, thresh); diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index f2f2e11dac4c..ed0f36ae5346 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -486,7 +486,6 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, em->map_lookup = map; map->num_stripes = test->num_stripes; - map->stripe_len = BTRFS_STRIPE_LEN; map->type = test->raid_type; for (i = 0; i < map->num_stripes; i++) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b8d5b1fa9a03..8b6a99b8d7f6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -601,15 +601,16 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, /* * We want to reserve all the bytes we may need all at once, so * we only do 1 enospc flushing cycle per transaction start. We - * accomplish this by simply assuming we'll do 2 x num_items - * worth of delayed refs updates in this trans handle, and - * refill that amount for whatever is missing in the reserve. + * accomplish this by simply assuming we'll do num_items worth + * of delayed refs updates in this trans handle, and refill that + * amount for whatever is missing in the reserve. 
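The start_transaction() reservation change in this hunk is easier to see with numbers: instead of assuming delayed ref updates cost as much as the item insertions and doubling the reservation, the transaction start now prices them separately via btrfs_calc_delayed_ref_bytes() and adds that amount on top. The sketch below uses invented per-item costs purely to contrast the two sizings; the real helpers depend on filesystem parameters such as the node size.

#include <stdint.h>
#include <stdio.h>

/*
 * Placeholder per-item costs; the real values come from
 * btrfs_calc_insert_metadata_size() and btrfs_calc_delayed_ref_bytes().
 */
#define DEMO_INSERT_BYTES_PER_ITEM      (16ULL * 1024)
#define DEMO_DELAYED_REF_BYTES_PER_ITEM (8ULL * 1024)

int main(void)
{
        const unsigned int num_items = 5;
        uint64_t insert_bytes = num_items * DEMO_INSERT_BYTES_PER_ITEM;

        /* Old sizing: assume delayed refs cost as much as the insertions, double it. */
        uint64_t old_total = insert_bytes << 1;

        /* New sizing: price the delayed refs separately and add them on top. */
        uint64_t delayed_refs_bytes = num_items * DEMO_DELAYED_REF_BYTES_PER_ITEM;
        uint64_t new_total = insert_bytes + delayed_refs_bytes;

        printf("old reservation: %llu bytes, new reservation: %llu bytes\n",
               (unsigned long long)old_total, (unsigned long long)new_total);
        return 0;
}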
*/ num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); if (flush == BTRFS_RESERVE_FLUSH_ALL && - btrfs_block_rsv_full(delayed_refs_rsv) == 0) { - delayed_refs_bytes = num_bytes; - num_bytes <<= 1; + !btrfs_block_rsv_full(delayed_refs_rsv)) { + delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, + num_items); + num_bytes += delayed_refs_bytes; } /* @@ -942,16 +943,6 @@ void btrfs_throttle(struct btrfs_fs_info *fs_info) wait_current_trans(fs_info); } -static bool should_end_transaction(struct btrfs_trans_handle *trans) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - - if (btrfs_check_space_for_delayed_refs(fs_info)) - return true; - - return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 50); -} - bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; @@ -960,7 +951,10 @@ bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags)) return true; - return should_end_transaction(trans); + if (btrfs_check_space_for_delayed_refs(trans->fs_info)) + return true; + + return !!btrfs_block_rsv_check(&trans->fs_info->global_block_rsv, 50); } static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index baad1ed7e111..e2b54793bf0c 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -849,6 +849,20 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, stripe_len); return -EUCLEAN; } + /* + * We artificially limit the chunk size, so that the number of stripes + * inside a chunk can be fit into a U32. The current limit (256G) is + * way too large for real world usage anyway, and it's also much larger + * than our existing limit (10G). + * + * Thus it should be a good way to catch obvious bitflips. 
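The stated intent of the new tree-checker limit is that any accepted chunk length, expressed in 64K stripes, fits in a u32. That property follows directly from the limit itself and can be checked in a few lines; the shift constant is mirrored locally for this standalone sketch.

#include <assert.h>
#include <stdint.h>

#define STRIPE_LEN_SHIFT 16     /* mirrors BTRFS_STRIPE_LEN_SHIFT */

int main(void)
{
        const uint64_t limit = (uint64_t)UINT32_MAX << STRIPE_LEN_SHIFT;

        /*
         * The checker rejects length >= limit, so the largest accepted length
         * is limit - 1. Even rounding the stripe count up, it stays within u32.
         */
        uint64_t max_accepted = limit - 1;
        uint64_t nr_stripes = (max_accepted + ((1ULL << STRIPE_LEN_SHIFT) - 1)) >>
                              STRIPE_LEN_SHIFT;

        assert(nr_stripes <= UINT32_MAX);
        return 0;
}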
+ */ + if (unlikely(length >= ((u64)U32_MAX << BTRFS_STRIPE_LEN_SHIFT))) { + chunk_err(leaf, chunk, logical, + "chunk length too large: have %llu limit %llu", + length, (u64)U32_MAX << BTRFS_STRIPE_LEN_SHIFT); + return -EUCLEAN; + } if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK))) { chunk_err(leaf, chunk, logical, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 200cea6e49e5..9b212e8c70cc 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2563,6 +2563,28 @@ static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) btrfs_put_block_group(cache); } +static int clean_log_buffer(struct btrfs_trans_handle *trans, + struct extent_buffer *eb) +{ + int ret; + + btrfs_tree_lock(eb); + btrfs_clear_buffer_dirty(trans, eb); + wait_on_extent_buffer_writeback(eb); + btrfs_tree_unlock(eb); + + if (trans) { + ret = btrfs_pin_reserved_extent(trans, eb->start, eb->len); + if (ret) + return ret; + btrfs_redirty_list_add(trans->transaction, eb); + } else { + unaccount_log_buffer(eb->fs_info, eb->start); + } + + return 0; +} + static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int *level, @@ -2573,7 +2595,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, u64 ptr_gen; struct extent_buffer *next; struct extent_buffer *cur; - u32 blocksize; int ret = 0; while (*level > 0) { @@ -2593,7 +2614,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, check.level = *level - 1; check.has_first_key = true; btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]); - blocksize = fs_info->nodesize; next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_header_owner(cur), @@ -2617,22 +2637,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return ret; } - btrfs_tree_lock(next); - btrfs_clear_buffer_dirty(trans, next); - wait_on_extent_buffer_writeback(next); - btrfs_tree_unlock(next); - - if (trans) { - ret = btrfs_pin_reserved_extent(trans, - bytenr, blocksize); - if (ret) { - free_extent_buffer(next); - return ret; - } - btrfs_redirty_list_add( - trans->transaction, next); - } else { - unaccount_log_buffer(fs_info, bytenr); + ret = clean_log_buffer(trans, next); + if (ret) { + free_extent_buffer(next); + return ret; } } free_extent_buffer(next); @@ -2662,7 +2670,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, struct btrfs_path *path, int *level, struct walk_control *wc) { - struct btrfs_fs_info *fs_info = root->fs_info; int i; int slot; int ret; @@ -2682,27 +2689,9 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, return ret; if (wc->free) { - struct extent_buffer *next; - - next = path->nodes[*level]; - - btrfs_tree_lock(next); - btrfs_clear_buffer_dirty(trans, next); - wait_on_extent_buffer_writeback(next); - btrfs_tree_unlock(next); - - if (trans) { - ret = btrfs_pin_reserved_extent(trans, - path->nodes[*level]->start, - path->nodes[*level]->len); - if (ret) - return ret; - btrfs_redirty_list_add(trans->transaction, - next); - } else { - unaccount_log_buffer(fs_info, - path->nodes[*level]->start); - } + ret = clean_log_buffer(trans, path->nodes[*level]); + if (ret) + return ret; } free_extent_buffer(path->nodes[*level]); path->nodes[*level] = NULL; @@ -2720,7 +2709,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, static int walk_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct 
walk_control *wc) { - struct btrfs_fs_info *fs_info = log->fs_info; int ret = 0; int wret; int level; @@ -2762,26 +2750,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, orig_level); if (ret) goto out; - if (wc->free) { - struct extent_buffer *next; - - next = path->nodes[orig_level]; - - btrfs_tree_lock(next); - btrfs_clear_buffer_dirty(trans, next); - wait_on_extent_buffer_writeback(next); - btrfs_tree_unlock(next); - - if (trans) { - ret = btrfs_pin_reserved_extent(trans, - next->start, next->len); - if (ret) - goto out; - btrfs_redirty_list_add(trans->transaction, next); - } else { - unaccount_log_buffer(fs_info, next->start); - } - } + if (wc->free) + ret = clean_log_buffer(trans, path->nodes[orig_level]); } out: @@ -3648,6 +3618,9 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, ret = BTRFS_LOG_FORCE_COMMIT; else inode->last_dir_index_offset = last_index; + + if (btrfs_get_first_dir_index_to_log(inode) == 0) + btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset); out: kfree(ins_data); @@ -4099,7 +4072,7 @@ static int drop_inode_items(struct btrfs_trans_handle *trans, found_key.offset = 0; found_key.type = 0; - ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot); + ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot); if (ret < 0) break; @@ -5406,6 +5379,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, LIST_HEAD(dir_list); struct btrfs_dir_list *dir_elem; u64 ino = btrfs_ino(start_inode); + struct btrfs_inode *curr_inode = start_inode; int ret = 0; /* @@ -5420,43 +5394,39 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; + /* Pairs with btrfs_add_delayed_iput below. */ + ihold(&curr_inode->vfs_inode); + while (true) { - struct extent_buffer *leaf; - struct btrfs_key min_key; + struct inode *vfs_inode; + struct btrfs_key key; + struct btrfs_key found_key; + u64 next_index; bool continue_curr_inode = true; - int nritems; - int i; + int iter_ret; - min_key.objectid = ino; - min_key.type = BTRFS_DIR_INDEX_KEY; - min_key.offset = 0; + key.objectid = ino; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = btrfs_get_first_dir_index_to_log(curr_inode); + next_index = key.offset; again: - btrfs_release_path(path); - ret = btrfs_search_forward(root, &min_key, path, trans->transid); - if (ret < 0) { - break; - } else if (ret > 0) { - ret = 0; - goto next; - } - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - for (i = path->slots[0]; i < nritems; i++) { + btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) { + struct extent_buffer *leaf = path->nodes[0]; struct btrfs_dir_item *di; struct btrfs_key di_key; struct inode *di_inode; int log_mode = LOG_INODE_EXISTS; int type; - btrfs_item_key_to_cpu(leaf, &min_key, i); - if (min_key.objectid != ino || - min_key.type != BTRFS_DIR_INDEX_KEY) { + if (found_key.objectid != ino || + found_key.type != BTRFS_DIR_INDEX_KEY) { continue_curr_inode = false; break; } - di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); + next_index = found_key.offset + 1; + + di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); type = btrfs_dir_ftype(leaf, di); if (btrfs_dir_transid(leaf, di) < trans->transid) continue; @@ -5496,12 +5466,24 @@ again: break; } - if (continue_curr_inode && min_key.offset < (u64)-1) { - min_key.offset++; + btrfs_release_path(path); + + if (iter_ret < 0) { + ret = iter_ret; + goto out; + } else if (iter_ret > 0) { + continue_curr_inode = false; + } else { + key = 
found_key; + } + + if (continue_curr_inode && key.offset < (u64)-1) { + key.offset++; goto again; } -next: + btrfs_set_first_dir_index_to_log(curr_inode, next_index); + if (list_empty(&dir_list)) break; @@ -5509,9 +5491,22 @@ next: ino = dir_elem->ino; list_del(&dir_elem->list); kfree(dir_elem); + + btrfs_add_delayed_iput(curr_inode); + curr_inode = NULL; + + vfs_inode = btrfs_iget(fs_info->sb, ino, root); + if (IS_ERR(vfs_inode)) { + ret = PTR_ERR(vfs_inode); + break; + } + curr_inode = BTRFS_I(vfs_inode); } out: btrfs_free_path(path); + if (curr_inode) + btrfs_add_delayed_iput(curr_inode); + if (ret) { struct btrfs_dir_list *next; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c6d592870400..03f52e4a20aa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -395,7 +395,6 @@ void btrfs_free_device(struct btrfs_device *device) { WARN_ON(!list_empty(&device->post_commit_list)); rcu_string_free(device->name); - extent_io_tree_release(&device->alloc_state); btrfs_destroy_dev_zone_info(device); kfree(device); } @@ -1150,10 +1149,10 @@ static void btrfs_close_one_device(struct btrfs_device *device) device->last_flush_error = 0; /* Verify the device is back in a pristine state */ - ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); - ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); - ASSERT(list_empty(&device->dev_alloc_list)); - ASSERT(list_empty(&device->post_commit_list)); + WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); + WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); + WARN_ON(!list_empty(&device->dev_alloc_list)); + WARN_ON(!list_empty(&device->post_commit_list)); } static void close_fs_devices(struct btrfs_fs_devices *fs_devices) @@ -2618,7 +2617,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path struct block_device *bdev; struct super_block *sb = fs_info->sb; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - struct btrfs_fs_devices *seed_devices; + struct btrfs_fs_devices *seed_devices = NULL; u64 orig_super_total_bytes; u64 orig_super_num_devices; int ret = 0; @@ -5125,7 +5124,7 @@ static void init_alloc_chunk_ctl_policy_regular( /* We don't want a chunk larger than 10% of writable space */ ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10), ctl->max_chunk_size); - ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; + ctl->dev_extent_min = ctl->dev_stripes << BTRFS_STRIPE_LEN_SHIFT; } static void init_alloc_chunk_ctl_policy_zoned( @@ -5407,7 +5406,6 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, j * ctl->stripe_size; } } - map->stripe_len = BTRFS_STRIPE_LEN; map->io_align = BTRFS_STRIPE_LEN; map->io_width = BTRFS_STRIPE_LEN; map->type = type; @@ -5438,7 +5436,7 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, } write_unlock(&em_tree->lock); - block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); + block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size); if (IS_ERR(block_group)) goto error_del_extent; @@ -5615,11 +5613,11 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, btrfs_set_stack_chunk_length(chunk, bg->length); btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID); - btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); + btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN); btrfs_set_stack_chunk_type(chunk, map->type); btrfs_set_stack_chunk_num_stripes(chunk, 
map->num_stripes); - btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); - btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); + btrfs_set_stack_chunk_io_align(chunk, BTRFS_STRIPE_LEN); + btrfs_set_stack_chunk_io_width(chunk, BTRFS_STRIPE_LEN); btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); @@ -5784,13 +5782,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) */ ret = map->num_stripes; free_extent_map(em); - - down_read(&fs_info->dev_replace.rwsem); - if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && - fs_info->dev_replace.tgtdev) - ret++; - up_read(&fs_info->dev_replace.rwsem); - return ret; } @@ -5809,7 +5800,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, if (!WARN_ON(IS_ERR(em))) { map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - len = map->stripe_len * nr_data_stripes(map); + len = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT; free_extent_map(em); } return len; @@ -5895,41 +5886,16 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, return preferred_mirror; } -/* Bubble-sort the stripe set to put the parity/syndrome stripes last */ -static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes) -{ - int i; - int again = 1; - - while (again) { - again = 0; - for (i = 0; i < num_stripes - 1; i++) { - /* Swap if parity is on a smaller index */ - if (bioc->raid_map[i] > bioc->raid_map[i + 1]) { - swap(bioc->stripes[i], bioc->stripes[i + 1]); - swap(bioc->raid_map[i], bioc->raid_map[i + 1]); - again = 1; - } - } - } -} - static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, - int total_stripes, - int real_stripes) + u16 total_stripes) { - struct btrfs_io_context *bioc = kzalloc( + struct btrfs_io_context *bioc; + + bioc = kzalloc( /* The size of btrfs_io_context */ sizeof(struct btrfs_io_context) + /* Plus the variable array for the stripes */ - sizeof(struct btrfs_io_stripe) * (total_stripes) + - /* Plus the variable array for the tgt dev */ - sizeof(int) * (real_stripes) + - /* - * Plus the raid_map, which includes both the tgt dev - * and the stripes. 
- */ - sizeof(u64) * (total_stripes), + sizeof(struct btrfs_io_stripe) * (total_stripes), GFP_NOFS); if (!bioc) @@ -5938,8 +5904,8 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_ refcount_set(&bioc->refs, 1); bioc->fs_info = fs_info; - bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes); - bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes); + bioc->replace_stripe_src = -1; + bioc->full_stripe_logical = (u64)-1; return bioc; } @@ -5971,16 +5937,15 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, struct btrfs_discard_stripe *stripes; u64 length = *length_ret; u64 offset; - u64 stripe_nr; - u64 stripe_nr_end; + u32 stripe_nr; + u32 stripe_nr_end; + u32 stripe_cnt; u64 stripe_end_offset; - u64 stripe_cnt; - u64 stripe_len; u64 stripe_offset; u32 stripe_index; u32 factor = 0; u32 sub_stripes = 0; - u64 stripes_per_dev = 0; + u32 stripes_per_dev = 0; u32 remaining_stripes = 0; u32 last_stripe = 0; int ret; @@ -5996,26 +5961,25 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EOPNOTSUPP; goto out_free_map; -} + } offset = logical - em->start; length = min_t(u64, em->start + em->len - logical, length); *length_ret = length; - stripe_len = map->stripe_len; /* * stripe_nr counts the total number of stripes we have to stride * to get to this block */ - stripe_nr = div64_u64(offset, stripe_len); + stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; /* stripe_offset is the offset of this block in its stripe */ - stripe_offset = offset - stripe_nr * stripe_len; + stripe_offset = offset - (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); - stripe_nr_end = round_up(offset + length, map->stripe_len); - stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); + stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >> + BTRFS_STRIPE_LEN_SHIFT; stripe_cnt = stripe_nr_end - stripe_nr; - stripe_end_offset = stripe_nr_end * map->stripe_len - + stripe_end_offset = (stripe_nr_end << BTRFS_STRIPE_LEN_SHIFT) - (offset + length); /* * after this, stripe_nr is the number of stripes on this @@ -6034,18 +5998,19 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, factor = map->num_stripes / sub_stripes; *num_stripes = min_t(u64, map->num_stripes, sub_stripes * stripe_cnt); - stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); + stripe_index = stripe_nr % factor; + stripe_nr /= factor; stripe_index *= sub_stripes; - stripes_per_dev = div_u64_rem(stripe_cnt, factor, - &remaining_stripes); - div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); - last_stripe *= sub_stripes; + + remaining_stripes = stripe_cnt % factor; + stripes_per_dev = stripe_cnt / factor; + last_stripe = ((stripe_nr_end - 1) % factor) * sub_stripes; } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_DUP)) { *num_stripes = map->num_stripes; } else { - stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, - &stripe_index); + stripe_index = stripe_nr % map->num_stripes; + stripe_nr /= map->num_stripes; } stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS); @@ -6057,15 +6022,15 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, for (i = 0; i < *num_stripes; i++) { stripes[i].physical = map->stripes[stripe_index].physical + - stripe_offset + stripe_nr * map->stripe_len; + stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & 
(BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - stripes[i].length = stripes_per_dev * map->stripe_len; + stripes[i].length = stripes_per_dev << BTRFS_STRIPE_LEN_SHIFT; if (i / sub_stripes < remaining_stripes) - stripes[i].length += map->stripe_len; + stripes[i].length += BTRFS_STRIPE_LEN; /* * Special for the first stripe and @@ -6103,83 +6068,6 @@ out_free_map: return ERR_PTR(ret); } -/* - * In dev-replace case, for repair case (that's the only case where the mirror - * is selected explicitly when calling btrfs_map_block), blocks left of the - * left cursor can also be read from the target drive. - * - * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the - * array of stripes. - * For READ, it also needs to be supported using the same mirror number. - * - * If the requested block is not left of the left cursor, EIO is returned. This - * can happen because btrfs_num_copies() returns one more in the dev-replace - * case. - */ -static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, - u64 logical, u64 length, - u64 srcdev_devid, int *mirror_num, - u64 *physical) -{ - struct btrfs_io_context *bioc = NULL; - int num_stripes; - int index_srcdev = 0; - int found = 0; - u64 physical_of_found = 0; - int i; - int ret = 0; - - ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &length, &bioc, NULL, NULL, 0); - if (ret) { - ASSERT(bioc == NULL); - return ret; - } - - num_stripes = bioc->num_stripes; - if (*mirror_num > num_stripes) { - /* - * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, - * that means that the requested area is not left of the left - * cursor - */ - btrfs_put_bioc(bioc); - return -EIO; - } - - /* - * process the rest of the function using the mirror_num of the source - * drive. Therefore look it up first. At the end, patch the device - * pointer to the one of the target drive. - */ - for (i = 0; i < num_stripes; i++) { - if (bioc->stripes[i].dev->devid != srcdev_devid) - continue; - - /* - * In case of DUP, in order to keep it simple, only add the - * mirror with the lowest physical address - */ - if (found && - physical_of_found <= bioc->stripes[i].physical) - continue; - - index_srcdev = i; - found = 1; - physical_of_found = bioc->stripes[i].physical; - } - - btrfs_put_bioc(bioc); - - ASSERT(found); - if (!found) - return -EIO; - - *mirror_num = index_srcdev + 1; - *physical = physical_of_found; - return ret; -} - static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) { struct btrfs_block_group *cache; @@ -6198,101 +6086,80 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) } static void handle_ops_on_dev_replace(enum btrfs_map_op op, - struct btrfs_io_context **bioc_ret, + struct btrfs_io_context *bioc, struct btrfs_dev_replace *dev_replace, u64 logical, int *num_stripes_ret, int *max_errors_ret) { - struct btrfs_io_context *bioc = *bioc_ret; u64 srcdev_devid = dev_replace->srcdev->devid; - int tgtdev_indexes = 0; + /* + * At this stage, num_stripes is still the real number of stripes, + * excluding the duplicated stripes. + */ int num_stripes = *num_stripes_ret; + int nr_extra_stripes = 0; int max_errors = *max_errors_ret; int i; - if (op == BTRFS_MAP_WRITE) { - int index_where_to_add; + /* + * A block group which has "to_copy" set will eventually be copied by + * the dev-replace process. We can avoid cloning IO here. 
+ */ + if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) + return; - /* - * A block group which have "to_copy" set will eventually - * copied by dev-replace process. We can avoid cloning IO here. - */ - if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) - return; + /* + * Duplicate the write operations while the dev-replace procedure is + * running. Since the copying of the old disk to the new disk takes + * place at run time while the filesystem is mounted writable, the + * regular write operations to the old disk have to be duplicated to go + * to the new disk as well. + * + * Note that device->missing is handled by the caller, and that the + * write to the old disk is already set up in the stripes array. + */ + for (i = 0; i < num_stripes; i++) { + struct btrfs_io_stripe *old = &bioc->stripes[i]; + struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes]; - /* - * duplicate the write operations while the dev replace - * procedure is running. Since the copying of the old disk to - * the new disk takes place at run time while the filesystem is - * mounted writable, the regular write operations to the old - * disk have to be duplicated to go to the new disk as well. - * - * Note that device->missing is handled by the caller, and that - * the write to the old disk is already set up in the stripes - * array. - */ - index_where_to_add = num_stripes; - for (i = 0; i < num_stripes; i++) { - if (bioc->stripes[i].dev->devid == srcdev_devid) { - /* write to new disk, too */ - struct btrfs_io_stripe *new = - bioc->stripes + index_where_to_add; - struct btrfs_io_stripe *old = - bioc->stripes + i; - - new->physical = old->physical; - new->dev = dev_replace->tgtdev; - bioc->tgtdev_map[i] = index_where_to_add; - index_where_to_add++; - max_errors++; - tgtdev_indexes++; - } - } - num_stripes = index_where_to_add; - } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { - int index_srcdev = 0; - int found = 0; - u64 physical_of_found = 0; + if (old->dev->devid != srcdev_devid) + continue; - /* - * During the dev-replace procedure, the target drive can also - * be used to read data in case it is needed to repair a corrupt - * block elsewhere. This is possible if the requested area is - * left of the left cursor. In this area, the target drive is a - * full copy of the source drive. - */ - for (i = 0; i < num_stripes; i++) { - if (bioc->stripes[i].dev->devid == srcdev_devid) { - /* - * In case of DUP, in order to keep it simple, - * only add the mirror with the lowest physical - * address - */ - if (found && - physical_of_found <= bioc->stripes[i].physical) - continue; - index_srcdev = i; - found = 1; - physical_of_found = bioc->stripes[i].physical; - } - } - if (found) { - struct btrfs_io_stripe *tgtdev_stripe = - bioc->stripes + num_stripes; + new->physical = old->physical; + new->dev = dev_replace->tgtdev; + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) + bioc->replace_stripe_src = i; + nr_extra_stripes++; + } + + /* We can only have at most 2 extra nr_stripes (for DUP). */ + ASSERT(nr_extra_stripes <= 2); + /* + * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for + * replace. + * If we have 2 extra stripes, only choose the one with smaller physical. 
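The rewritten handle_ops_on_dev_replace() boils down to one pass over the already-mapped stripes: every write that lands on the source device gets a duplicate entry pointing at the replace target at the same physical offset, and at most two such duplicates can exist (only DUP keeps two copies on one device). The standalone sketch below mirrors that pass with invented device IDs and offsets; the final keep-the-lower-physical step is a simplification of the kernel's GET_READ_MIRRORS handling, not a literal copy of it.

#include <stdint.h>
#include <stdio.h>

struct demo_stripe {
        int devid;
        uint64_t physical;
};

int main(void)
{
        /* Hypothetical DUP chunk: both copies live on the device being replaced. */
        struct demo_stripe stripes[4] = {
                { .devid = 1, .physical = 1048576ULL },
                { .devid = 1, .physical = 9437184ULL },
        };
        const int src_devid = 1;
        const int tgt_devid = 100;      /* the replace target, invented */
        int num_stripes = 2;
        int nr_extra = 0;

        /* Duplicate every stripe on the source device onto the target, same physical. */
        for (int i = 0; i < num_stripes; i++) {
                if (stripes[i].devid != src_devid)
                        continue;
                stripes[num_stripes + nr_extra].devid = tgt_devid;
                stripes[num_stripes + nr_extra].physical = stripes[i].physical;
                nr_extra++;
        }

        /* Simplified: when both DUP copies were duplicated, report only the lower one. */
        if (nr_extra == 2) {
                if (stripes[num_stripes].physical > stripes[num_stripes + 1].physical) {
                        struct demo_stripe tmp = stripes[num_stripes];

                        stripes[num_stripes] = stripes[num_stripes + 1];
                        stripes[num_stripes + 1] = tmp;
                }
                nr_extra--;
        }

        for (int i = 0; i < num_stripes + nr_extra; i++)
                printf("stripe %d: devid %d physical %llu\n", i, stripes[i].devid,
                       (unsigned long long)stripes[i].physical);
        return 0;
}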
+ */ + if (op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) { + struct btrfs_io_stripe *first = &bioc->stripes[num_stripes]; + struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1]; - tgtdev_stripe->physical = physical_of_found; - tgtdev_stripe->dev = dev_replace->tgtdev; - bioc->tgtdev_map[index_srcdev] = num_stripes; + /* Only DUP can have two extra stripes. */ + ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP); - tgtdev_indexes++; - num_stripes++; + /* + * Swap the last stripe stripes and reduce @nr_extra_stripes. + * The extra stripe would still be there, but won't be accessed. + */ + if (first->physical > second->physical) { + swap(second->physical, first->physical); + swap(second->dev, first->dev); + nr_extra_stripes--; } } - *num_stripes_ret = num_stripes; - *max_errors_ret = max_errors; - bioc->num_tgtdevs = tgtdev_indexes; - *bioc_ret = bioc; + *num_stripes_ret = num_stripes + nr_extra_stripes; + *max_errors_ret = max_errors + nr_extra_stripes; + bioc->replace_nr_stripes = nr_extra_stripes; } static bool need_full_stripe(enum btrfs_map_op op) @@ -6301,25 +6168,35 @@ static bool need_full_stripe(enum btrfs_map_op op) } static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, - u64 offset, u64 *stripe_nr, u64 *stripe_offset, + u64 offset, u32 *stripe_nr, u64 *stripe_offset, u64 *full_stripe_start) { - u32 stripe_len = map->stripe_len; - ASSERT(op != BTRFS_MAP_DISCARD); /* * Stripe_nr is the stripe where this block falls. stripe_offset is * the offset of this block in its stripe. */ - *stripe_nr = div64_u64_rem(offset, stripe_len, stripe_offset); + *stripe_offset = offset & BTRFS_STRIPE_LEN_MASK; + *stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; ASSERT(*stripe_offset < U32_MAX); if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); + unsigned long full_stripe_len = nr_data_stripes(map) << + BTRFS_STRIPE_LEN_SHIFT; + /* + * For full stripe start, we use previously calculated + * @stripe_nr. Align it to nr_data_stripes, then multiply with + * STRIPE_LEN. + * + * By this we can avoid u64 division completely. And we have + * to go rounddown(), not round_down(), as nr_data_stripes is + * not ensured to be power of 2. + */ *full_stripe_start = - div64_u64(offset, full_stripe_len) * full_stripe_len; + rounddown(*stripe_nr, nr_data_stripes(map)) << + BTRFS_STRIPE_LEN_SHIFT; /* * For writes to RAID56, allow to write a full stripe set, but @@ -6334,16 +6211,16 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, * a single disk). */ if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) - return stripe_len - *stripe_offset; + return BTRFS_STRIPE_LEN - *stripe_offset; return U64_MAX; } static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, - u32 stripe_index, u64 stripe_offset, u64 stripe_nr) + u32 stripe_index, u64 stripe_offset, u32 stripe_nr) { dst->dev = map->stripes[stripe_index].dev; dst->physical = map->stripes[stripe_index].physical + - stripe_offset + stripe_nr * map->stripe_len; + stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); } int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, @@ -6356,35 +6233,35 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct map_lookup *map; u64 map_offset; u64 stripe_offset; - u64 stripe_nr; - u64 stripe_len; + u32 stripe_nr; u32 stripe_index; int data_stripes; int i; int ret = 0; int mirror_num = (mirror_num_ret ? 
*mirror_num_ret : 0); int num_stripes; + int num_copies; int max_errors = 0; - int tgtdev_indexes = 0; struct btrfs_io_context *bioc = NULL; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int dev_replace_is_ongoing = 0; - int num_alloc_stripes; - int patch_the_first_stripe_for_dev_replace = 0; - u64 physical_to_patch_in_first_stripe = 0; + u16 num_alloc_stripes; u64 raid56_full_stripe_start = (u64)-1; u64 max_len; ASSERT(bioc_ret); ASSERT(op != BTRFS_MAP_DISCARD); + num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize); + if (mirror_num > num_copies) + return -EINVAL; + em = btrfs_get_chunk_map(fs_info, logical, *length); if (IS_ERR(em)) return PTR_ERR(em); map = em->map_lookup; data_stripes = nr_data_stripes(map); - stripe_len = map->stripe_len; map_offset = logical - em->start; max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr, @@ -6400,25 +6277,11 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, if (!dev_replace_is_ongoing) up_read(&dev_replace->rwsem); - if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && - !need_full_stripe(op) && dev_replace->tgtdev != NULL) { - ret = get_extra_mirror_from_replace(fs_info, logical, *length, - dev_replace->srcdev->devid, - &mirror_num, - &physical_to_patch_in_first_stripe); - if (ret) - goto out; - else - patch_the_first_stripe_for_dev_replace = 1; - } else if (mirror_num > map->num_stripes) { - mirror_num = 0; - } - num_stripes = 1; stripe_index = 0; if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, - &stripe_index); + stripe_index = stripe_nr % map->num_stripes; + stripe_nr /= map->num_stripes; if (!need_full_stripe(op)) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { @@ -6444,8 +6307,8 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { u32 factor = map->num_stripes / map->sub_stripes; - stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); - stripe_index *= map->sub_stripes; + stripe_index = (stripe_nr % factor) * map->sub_stripes; + stripe_nr /= factor; if (need_full_stripe(op)) num_stripes = map->sub_stripes; @@ -6460,11 +6323,17 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - ASSERT(map->stripe_len == BTRFS_STRIPE_LEN); if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { - /* push stripe_nr back to the start of the full stripe */ - stripe_nr = div64_u64(raid56_full_stripe_start, - stripe_len * data_stripes); + /* + * Push stripe_nr back to the start of the full stripe + * For those cases needing a full stripe, @stripe_nr + * is the full stripe number. + * + * Originally we go raid56_full_stripe_start / full_stripe_len, + * but that can be expensive. Here we just divide + * @stripe_nr with @data_stripes. + */ + stripe_nr /= data_stripes; /* RAID[56] write or recovery. Return all stripes */ num_stripes = map->num_stripes; @@ -6473,7 +6342,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, /* Return the length to the full stripe end */ *length = min(logical + *length, raid56_full_stripe_start + em->start + - data_stripes * stripe_len) - logical; + (data_stripes << BTRFS_STRIPE_LEN_SHIFT)) - logical; stripe_index = 0; stripe_offset = 0; } else { @@ -6482,25 +6351,24 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * Mirror #2 is RAID5 parity block. 
* Mirror #3 is RAID6 Q block. */ - stripe_nr = div_u64_rem(stripe_nr, - data_stripes, &stripe_index); + stripe_index = stripe_nr % data_stripes; + stripe_nr /= data_stripes; if (mirror_num > 1) stripe_index = data_stripes + mirror_num - 2; /* We distribute the parity blocks across stripes */ - div_u64_rem(stripe_nr + stripe_index, map->num_stripes, - &stripe_index); + stripe_index = (stripe_nr + stripe_index) % map->num_stripes; if (!need_full_stripe(op) && mirror_num <= 1) mirror_num = 1; } } else { /* - * after this, stripe_nr is the number of stripes on this + * After this, stripe_nr is the number of stripes on this * device we have to walk to find the data, and stripe_index is * the number of our device in the stripe array */ - stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, - &stripe_index); + stripe_index = stripe_nr % map->num_stripes; + stripe_nr /= map->num_stripes; mirror_num = stripe_index + 1; } if (stripe_index >= map->num_stripes) { @@ -6512,13 +6380,16 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } num_alloc_stripes = num_stripes; - if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { - if (op == BTRFS_MAP_WRITE) - num_alloc_stripes <<= 1; - if (op == BTRFS_MAP_GET_READ_MIRRORS) - num_alloc_stripes++; - tgtdev_indexes = num_stripes; - } + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && + op != BTRFS_MAP_READ) + /* + * For replace case, we need to add extra stripes for extra + * duplicated stripes. + * + * For both WRITE and GET_READ_MIRRORS, we may have at most + * 2 more stripes (DUP types, otherwise 1). + */ + num_alloc_stripes += 2; /* * If this I/O maps to a single device, try to return the device and @@ -6529,53 +6400,53 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) && (!need_full_stripe(op) || !dev_replace_is_ongoing || !dev_replace->tgtdev)) { - if (patch_the_first_stripe_for_dev_replace) { - smap->dev = dev_replace->tgtdev; - smap->physical = physical_to_patch_in_first_stripe; - *mirror_num_ret = map->num_stripes + 1; - } else { - set_io_stripe(smap, map, stripe_index, stripe_offset, - stripe_nr); - *mirror_num_ret = mirror_num; - } + set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr); + *mirror_num_ret = mirror_num; *bioc_ret = NULL; ret = 0; goto out; } - bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes); + bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes); if (!bioc) { ret = -ENOMEM; goto out; } + bioc->map_type = map->type; - for (i = 0; i < num_stripes; i++) { - set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset, - stripe_nr); - stripe_index++; - } - - /* Build raid_map */ + /* + * For RAID56 full map, we need to make sure the stripes[] follows the + * rule that data stripes are all ordered, then followed with P and Q + * (if we have). + * + * It's still mostly the same as other profiles, just with extra rotation. 
+ */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { - u64 tmp; - unsigned rot; - - /* Work out the disk rotation on this stripe-set */ - div_u64_rem(stripe_nr, num_stripes, &rot); - - /* Fill in the logical address of each stripe */ - tmp = stripe_nr * data_stripes; - for (i = 0; i < data_stripes; i++) - bioc->raid_map[(i + rot) % num_stripes] = - em->start + (tmp + i) * map->stripe_len; - - bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE; - if (map->type & BTRFS_BLOCK_GROUP_RAID6) - bioc->raid_map[(i + rot + 1) % num_stripes] = - RAID6_Q_STRIPE; - - sort_parity_stripes(bioc, num_stripes); + /* + * For RAID56 @stripe_nr is already the number of full stripes + * before us, which is also the rotation value (needs to modulo + * with num_stripes). + * + * In this case, we just add @stripe_nr with @i, then do the + * modulo, to reduce one modulo call. + */ + bioc->full_stripe_logical = em->start + + ((stripe_nr * data_stripes) << BTRFS_STRIPE_LEN_SHIFT); + for (i = 0; i < num_stripes; i++) + set_io_stripe(&bioc->stripes[i], map, + (i + stripe_nr) % num_stripes, + stripe_offset, stripe_nr); + } else { + /* + * For all other non-RAID56 profiles, just copy the target + * stripe into the bioc. + */ + for (i = 0; i < num_stripes; i++) { + set_io_stripe(&bioc->stripes[i], map, stripe_index, + stripe_offset, stripe_nr); + stripe_index++; + } } if (need_full_stripe(op)) @@ -6583,27 +6454,15 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && need_full_stripe(op)) { - handle_ops_on_dev_replace(op, &bioc, dev_replace, logical, + handle_ops_on_dev_replace(op, bioc, dev_replace, logical, &num_stripes, &max_errors); } *bioc_ret = bioc; - bioc->map_type = map->type; bioc->num_stripes = num_stripes; bioc->max_errors = max_errors; bioc->mirror_num = mirror_num; - /* - * this is the case that REQ_READ && dev_replace_is_ongoing && - * mirror_num == num_stripes + 1 && dev_replace target drive is - * available as a mirror - */ - if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { - WARN_ON(num_stripes > 1); - bioc->stripes[0].dev = dev_replace->tgtdev; - bioc->stripes[0].physical = physical_to_patch_in_first_stripe; - bioc->mirror_num = map->num_stripes + 1; - } out: if (dev_replace_is_ongoing) { lockdep_assert_held(&dev_replace->rwsem); @@ -6941,7 +6800,6 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->num_stripes = num_stripes; map->io_width = btrfs_chunk_io_width(leaf, chunk); map->io_align = btrfs_chunk_io_align(leaf, chunk); - map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); map->type = type; /* * We can't use the sub_stripes value, as for profiles other than @@ -8161,3 +8019,76 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) return true; } + +static void map_raid56_repair_block(struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, + u64 logical) +{ + int data_stripes = nr_bioc_data_stripes(bioc); + int i; + + for (i = 0; i < data_stripes; i++) { + u64 stripe_start = bioc->full_stripe_logical + + (i << BTRFS_STRIPE_LEN_SHIFT); + + if (logical >= stripe_start && + logical < stripe_start + BTRFS_STRIPE_LEN) + break; + } + ASSERT(i < data_stripes); + smap->dev = bioc->stripes[i].dev; + smap->physical = bioc->stripes[i].physical + + ((logical - bioc->full_stripe_logical) & + BTRFS_STRIPE_LEN_MASK); +} + +/* + * Map a repair write into a single device. 
+ * + * A repair write is triggered by read time repair or scrub, which would only + * update the contents of a single device. + * Not update any other mirrors nor go through RMW path. + * + * Callers should ensure: + * + * - Call btrfs_bio_counter_inc_blocked() first + * - The range does not cross stripe boundary + * - Has a valid @mirror_num passed in. + */ +int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, + struct btrfs_io_stripe *smap, u64 logical, + u32 length, int mirror_num) +{ + struct btrfs_io_context *bioc = NULL; + u64 map_length = length; + int mirror_ret = mirror_num; + int ret; + + ASSERT(mirror_num > 0); + + ret = __btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, + &bioc, smap, &mirror_ret, true); + if (ret < 0) + return ret; + + /* The map range should not cross stripe boundary. */ + ASSERT(map_length >= length); + + /* Already mapped to single stripe. */ + if (!bioc) + goto out; + + /* Map the RAID56 multi-stripe writes to a single one. */ + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + map_raid56_repair_block(bioc, smap, logical); + goto out; + } + + ASSERT(mirror_num <= bioc->num_stripes); + smap->dev = bioc->stripes[mirror_num - 1].dev; + smap->physical = bioc->stripes[mirror_num - 1].physical; +out: + btrfs_put_bioc(bioc); + ASSERT(smap->dev); + return 0; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7e51f2238f72..bf47a1a70813 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -17,7 +17,11 @@ extern struct mutex uuid_mutex; -#define BTRFS_STRIPE_LEN SZ_64K +#define BTRFS_STRIPE_LEN SZ_64K +#define BTRFS_STRIPE_LEN_SHIFT (16) +#define BTRFS_STRIPE_LEN_MASK (BTRFS_STRIPE_LEN - 1) + +static_assert(const_ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT); /* Used by sanity check for btrfs_raid_types. */ #define const_ffs(n) (__builtin_ctzll(n) + 1) @@ -404,17 +408,74 @@ struct btrfs_io_context { u64 map_type; /* get from map_lookup->type */ struct bio *orig_bio; atomic_t error; - int max_errors; - int num_stripes; - int mirror_num; - int num_tgtdevs; - int *tgtdev_map; + u16 max_errors; + + /* + * The total number of stripes, including the extra duplicated + * stripe for replace. + */ + u16 num_stripes; + + /* + * The mirror_num of this bioc. + * + * This is for reads which use 0 as mirror_num, thus we should return a + * valid mirror_num (>0) for the reader. + */ + u16 mirror_num; + + /* + * The following two members are for dev-replace case only. + * + * @replace_nr_stripes: Number of duplicated stripes which need to be + * written to replace target. + * Should be <= 2 (2 for DUP, otherwise <= 1). + * @replace_stripe_src: The array indicates where the duplicated stripes + * are from. + * + * The @replace_stripe_src[] array is mostly for RAID56 cases. + * As non-RAID56 stripes share the same contents of the mapped range, + * thus no need to bother where the duplicated ones are from. + * + * But for RAID56 case, all stripes contain different contents, thus + * we need a way to know the mapping. + * + * There is an example for the two members, using a RAID5 write: + * + * num_stripes: 4 (3 + 1 duplicated write) + * stripes[0]: dev = devid 1, physical = X + * stripes[1]: dev = devid 2, physical = Y + * stripes[2]: dev = devid 3, physical = Z + * stripes[3]: dev = devid 0, physical = Y + * + * replace_nr_stripes = 1 + * replace_stripe_src = 1 <- Means stripes[1] is involved in replace. + * The duplicated stripe index would be + * (@num_stripes - 1). 
+ * + * Note, that we can still have cases replace_nr_stripes = 2 for DUP. + * In that case, all stripes share the same content, thus we don't + * need to bother @replace_stripe_src value at all. + */ + u16 replace_nr_stripes; + s16 replace_stripe_src; /* - * logical block numbers for the start of each stripe - * The last one or two are p/q. These are sorted, - * so raid_map[0] is the start of our full stripe + * Logical bytenr of the full stripe start, only for RAID56 cases. + * + * When this value is set to other than (u64)-1, the stripes[] should + * follow this pattern: + * + * (real_stripes = num_stripes - replace_nr_stripes) + * (data_stripes = (is_raid6) ? (real_stripes - 2) : (real_stripes - 1)) + * + * stripes[0]: The first data stripe + * stripes[1]: The second data stripe + * ... + * stripes[data_stripes - 1]: The last data stripe + * stripes[data_stripes]: The P stripe + * stripes[data_stripes + 1]: The Q stripe (only for RAID6). */ - u64 *raid_map; + u64 full_stripe_logical; struct btrfs_io_stripe stripes[]; }; @@ -446,7 +507,6 @@ struct map_lookup { u64 type; int io_align; int io_width; - u32 stripe_len; int num_stripes; int sub_stripes; int verified_stripes; /* For mount time dev extent verification */ @@ -527,6 +587,9 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct btrfs_io_context **bioc_ret, struct btrfs_io_stripe *smap, int *mirror_num_ret, int need_raid_map); +int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, + struct btrfs_io_stripe *smap, u64 logical, + u32 length, int mirror_num); struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, u32 *num_stripes); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index da7bb9187b68..8acb05e176c5 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -350,8 +350,6 @@ done: zlib_inflateEnd(&workspace->strm); if (data_in) kunmap_local(data_in); - if (!ret) - zero_fill_bio(cb->orig_bio); return ret; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 45d04092f2f8..a9b32ba6b2ce 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1640,14 +1640,14 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio) { u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT); struct btrfs_inode *inode = bbio->inode; - struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_fs_info *fs_info = bbio->fs_info; struct btrfs_block_group *cache; bool ret = false; if (!btrfs_is_zoned(fs_info)) return false; - if (!is_data_inode(&inode->vfs_inode)) + if (!inode || !is_data_inode(&inode->vfs_inode)) return false; if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index e34f1ab99d56..f798da267590 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -609,7 +609,6 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } } ret = 0; - zero_fill_bio(cb->orig_bio); done: if (workspace->in_buf.src) kunmap_local(workspace->in_buf.src); |
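
As a minimal illustration of the fixed 64K stripe geometry the series above relies on, the standalone userspace sketch below mirrors the shift/mask/modulo pattern that replaces div_u64_rem() in __btrfs_map_block() and the offset math in map_raid56_repair_block(). This is not kernel code: the demo_* helpers and the sample numbers are hypothetical, and only the BTRFS_STRIPE_LEN/_SHIFT/_MASK definitions are taken from the volumes.h hunk above.

/*
 * Standalone sketch (not kernel code) of the 64K stripe math used in the
 * series above.  The demo_* names are made up for illustration only.
 */
#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define BTRFS_STRIPE_LEN	(64 * 1024ULL)
#define BTRFS_STRIPE_LEN_SHIFT	16
#define BTRFS_STRIPE_LEN_MASK	(BTRFS_STRIPE_LEN - 1)

/*
 * RAID0-style mapping: split a chunk-relative offset into the offset inside
 * a 64K stripe, the device index, and the device-local stripe number, using
 * shift/mask/modulo instead of div_u64_rem().
 */
static void demo_map_raid0(uint64_t map_offset, int num_stripes,
			   int *stripe_index, uint64_t *stripe_nr,
			   uint64_t *stripe_offset)
{
	*stripe_offset = map_offset & BTRFS_STRIPE_LEN_MASK;
	*stripe_nr = map_offset >> BTRFS_STRIPE_LEN_SHIFT;
	*stripe_index = *stripe_nr % num_stripes;
	*stripe_nr /= num_stripes;
}

/*
 * RAID56 repair-style mapping: given the logical start of the full stripe,
 * find which data stripe a logical address falls into and the byte offset
 * inside that 64K stripe (same arithmetic as map_raid56_repair_block()).
 */
static void demo_map_raid56_repair(uint64_t full_stripe_logical,
				   uint64_t logical, int data_stripes,
				   int *stripe_index, uint64_t *offset)
{
	uint64_t rel = logical - full_stripe_logical;

	*stripe_index = rel >> BTRFS_STRIPE_LEN_SHIFT;
	assert(*stripe_index < data_stripes);
	*offset = rel & BTRFS_STRIPE_LEN_MASK;
}

int main(void)
{
	int index;
	uint64_t nr, off;

	/* 300K into a 4-disk RAID0 chunk: disk 0, device stripe 1, 44K in. */
	demo_map_raid0(300 * 1024, 4, &index, &nr, &off);
	printf("raid0: disk %d, stripe_nr %llu, offset %llu\n",
	       index, (unsigned long long)nr, (unsigned long long)off);

	/* 100K past the full stripe start: second data stripe, 36K in. */
	demo_map_raid56_repair(1 << 20, (1 << 20) + 100 * 1024, 3,
			       &index, &off);
	printf("raid56 repair: data stripe %d, offset %llu\n",
	       index, (unsigned long long)off);
	return 0;
}

The sketch shows why the conversion pays off: with BTRFS_STRIPE_LEN fixed at a power of two, the former div_u64_rem()/div64_u64() calls collapse into shifts, masks and plain C division, avoiding the 64-bit division helpers the old code needed.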