summaryrefslogtreecommitdiff
path: root/fs/btrfs/scrub.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/scrub.c')
-rw-r--r--fs/btrfs/scrub.c159
1 files changed, 111 insertions, 48 deletions
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 485cda3eb8d7..088641ba7a8e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -165,6 +165,10 @@ struct scrub_ctx {
int readonly;
int pages_per_rd_bio;
+ /* State of IO submission throttling affecting the associated device */
+ ktime_t throttle_deadline;
+ u64 throttle_sent;
+
int is_dev_replace;
u64 write_pointer;
@@ -605,6 +609,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
spin_lock_init(&sctx->list_lock);
spin_lock_init(&sctx->stat_lock);
init_waitqueue_head(&sctx->list_wait);
+ sctx->throttle_deadline = 0;
WARN_ON(sctx->wr_curr_bio != NULL);
mutex_init(&sctx->wr_lock);
@@ -626,7 +631,6 @@ nomem:
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
void *warn_ctx)
{
- u64 isize;
u32 nlink;
int ret;
int i;
@@ -662,7 +666,6 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
eb = swarn->path->nodes[0];
inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
struct btrfs_inode_item);
- isize = btrfs_inode_size(eb, inode_item);
nlink = btrfs_inode_nlink(eb, inode_item);
btrfs_release_path(swarn->path);
@@ -691,12 +694,12 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
*/
for (i = 0; i < ipath->fspath->elem_cnt; ++i)
btrfs_warn_in_rcu(fs_info,
-"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
+"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
swarn->errstr, swarn->logical,
rcu_str_deref(swarn->dev->name),
swarn->physical,
root, inum, offset,
- min(isize - offset, (u64)PAGE_SIZE), nlink,
+ fs_info->sectorsize, nlink,
(char *)(unsigned long)ipath->fspath->val[i]);
btrfs_put_root(local_root);
@@ -885,25 +888,25 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* read all mirrors one after the other. This includes to
* re-read the extent or metadata block that failed (that was
* the cause that this fixup code is called) another time,
- * page by page this time in order to know which pages
+ * sector by sector this time in order to know which sectors
* caused I/O errors and which ones are good (for all mirrors).
* It is the goal to handle the situation when more than one
* mirror contains I/O errors, but the errors do not
* overlap, i.e. the data can be repaired by selecting the
- * pages from those mirrors without I/O error on the
- * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
- * would be that mirror #1 has an I/O error on the first page,
- * the second page is good, and mirror #2 has an I/O error on
- * the second page, but the first page is good.
- * Then the first page of the first mirror can be repaired by
- * taking the first page of the second mirror, and the
- * second page of the second mirror can be repaired by
- * copying the contents of the 2nd page of the 1st mirror.
- * One more note: if the pages of one mirror contain I/O
+ * sectors from those mirrors without I/O error on the
+ * particular sectors. One example (with blocks >= 2 * sectorsize)
+ * would be that mirror #1 has an I/O error on the first sector,
+ * the second sector is good, and mirror #2 has an I/O error on
+ * the second sector, but the first sector is good.
+ * Then the first sector of the first mirror can be repaired by
+ * taking the first sector of the second mirror, and the
+ * second sector of the second mirror can be repaired by
+ * copying the contents of the 2nd sector of the 1st mirror.
+ * One more note: if the sectors of one mirror contain I/O
* errors, the checksum cannot be verified. In order to get
* the best data for repairing, the first attempt is to find
* a mirror without I/O errors and with a validated checksum.
- * Only if this is not possible, the pages are picked from
+ * Only if this is not possible, the sectors are picked from
* mirrors with I/O errors without considering the checksum.
* If the latter is the case, at the end, the checksum of the
* repaired area is verified in order to correctly maintain
@@ -1060,26 +1063,26 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
/*
* In case of I/O errors in the area that is supposed to be
- * repaired, continue by picking good copies of those pages.
- * Select the good pages from mirrors to rewrite bad pages from
+ * repaired, continue by picking good copies of those sectors.
+ * Select the good sectors from mirrors to rewrite bad sectors from
* the area to fix. Afterwards verify the checksum of the block
* that is supposed to be repaired. This verification step is
* only done for the purpose of statistic counting and for the
* final scrub report, whether errors remain.
* A perfect algorithm could make use of the checksum and try
- * all possible combinations of pages from the different mirrors
+ * all possible combinations of sectors from the different mirrors
* until the checksum verification succeeds. For example, when
- * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
+ * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
* of mirror #2 is readable but the final checksum test fails,
- * then the 2nd page of mirror #3 could be tried, whether now
+ * then the 2nd sector of mirror #3 could be tried, whether now
* the final checksum succeeds. But this would be a rare
* exception and is therefore not implemented. At least it is
* avoided that the good copy is overwritten.
* A more useful improvement would be to pick the sectors
* without I/O error based on sector sizes (512 bytes on legacy
- * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
+ * disks) instead of on sectorsize. Then maybe 512 byte of one
* mirror could be repaired by taking 512 byte of a different
- * mirror, even if other 512 byte sectors in the same PAGE_SIZE
+ * mirror, even if other 512 byte sectors in the same sectorsize
* area are unreadable.
*/
success = 1;
@@ -1260,7 +1263,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
{
struct scrub_ctx *sctx = original_sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- u64 length = original_sblock->page_count * PAGE_SIZE;
+ u64 length = original_sblock->page_count * fs_info->sectorsize;
u64 logical = original_sblock->pagev[0]->logical;
u64 generation = original_sblock->pagev[0]->generation;
u64 flags = original_sblock->pagev[0]->flags;
@@ -1283,13 +1286,13 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
*/
while (length > 0) {
- sublen = min_t(u64, length, PAGE_SIZE);
+ sublen = min_t(u64, length, fs_info->sectorsize);
mapped_length = sublen;
bbio = NULL;
/*
- * with a length of PAGE_SIZE, each returned stripe
- * represents one mirror
+ * With a length of sectorsize, each returned stripe represents
+ * one mirror
*/
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
@@ -1480,7 +1483,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
bio = btrfs_io_bio_alloc(1);
bio_set_dev(bio, spage->dev->bdev);
- bio_add_page(bio, spage->page, PAGE_SIZE, 0);
+ bio_add_page(bio, spage->page, fs_info->sectorsize, 0);
bio->bi_iter.bi_sector = spage->physical >> 9;
bio->bi_opf = REQ_OP_READ;
@@ -1544,6 +1547,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
struct scrub_page *spage_good = sblock_good->pagev[page_num];
struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
+ const u32 sectorsize = fs_info->sectorsize;
BUG_ON(spage_bad->page == NULL);
BUG_ON(spage_good->page == NULL);
@@ -1563,8 +1567,8 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
bio->bi_iter.bi_sector = spage_bad->physical >> 9;
bio->bi_opf = REQ_OP_WRITE;
- ret = bio_add_page(bio, spage_good->page, PAGE_SIZE, 0);
- if (PAGE_SIZE != ret) {
+ ret = bio_add_page(bio, spage_good->page, sectorsize, 0);
+ if (ret != sectorsize) {
bio_put(bio);
return -EIO;
}
@@ -1642,6 +1646,7 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
{
struct scrub_bio *sbio;
int ret;
+ const u32 sectorsize = sctx->fs_info->sectorsize;
mutex_lock(&sctx->wr_lock);
again:
@@ -1681,16 +1686,16 @@ again:
bio->bi_iter.bi_sector = sbio->physical >> 9;
bio->bi_opf = REQ_OP_WRITE;
sbio->status = 0;
- } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+ } else if (sbio->physical + sbio->page_count * sectorsize !=
spage->physical_for_dev_replace ||
- sbio->logical + sbio->page_count * PAGE_SIZE !=
+ sbio->logical + sbio->page_count * sectorsize !=
spage->logical) {
scrub_wr_submit(sctx);
goto again;
}
- ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
- if (ret != PAGE_SIZE) {
+ ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
+ if (ret != sectorsize) {
if (sbio->page_count < 1) {
bio_put(sbio->bio);
sbio->bio = NULL;
@@ -1729,7 +1734,8 @@ static void scrub_wr_submit(struct scrub_ctx *sctx)
btrfsic_submit_bio(sbio->bio);
if (btrfs_is_zoned(sctx->fs_info))
- sctx->write_pointer = sbio->physical + sbio->page_count * PAGE_SIZE;
+ sctx->write_pointer = sbio->physical + sbio->page_count *
+ sctx->fs_info->sectorsize;
}
static void scrub_wr_bio_end_io(struct bio *bio)
@@ -1988,6 +1994,65 @@ static void scrub_page_put(struct scrub_page *spage)
}
}
+/*
+ * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
+ * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
+ */
+static void scrub_throttle(struct scrub_ctx *sctx)
+{
+ const int time_slice = 1000;
+ struct scrub_bio *sbio;
+ struct btrfs_device *device;
+ s64 delta;
+ ktime_t now;
+ u32 div;
+ u64 bwlimit;
+
+ sbio = sctx->bios[sctx->curr];
+ device = sbio->dev;
+ bwlimit = READ_ONCE(device->scrub_speed_max);
+ if (bwlimit == 0)
+ return;
+
+ /*
+ * Slice is divided into intervals when the IO is submitted, adjust by
+ * bwlimit and maximum of 64 intervals.
+ */
+ div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
+ div = min_t(u32, 64, div);
+
+ /* Start new epoch, set deadline */
+ now = ktime_get();
+ if (sctx->throttle_deadline == 0) {
+ sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
+ sctx->throttle_sent = 0;
+ }
+
+ /* Still in the time to send? */
+ if (ktime_before(now, sctx->throttle_deadline)) {
+ /* If current bio is within the limit, send it */
+ sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
+ if (sctx->throttle_sent <= div_u64(bwlimit, div))
+ return;
+
+ /* We're over the limit, sleep until the rest of the slice */
+ delta = ktime_ms_delta(sctx->throttle_deadline, now);
+ } else {
+ /* New request after deadline, start new epoch */
+ delta = 0;
+ }
+
+ if (delta) {
+ long timeout;
+
+ timeout = div_u64(delta * HZ, 1000);
+ schedule_timeout_interruptible(timeout);
+ }
+
+ /* Next call will start the deadline period */
+ sctx->throttle_deadline = 0;
+}
+
static void scrub_submit(struct scrub_ctx *sctx)
{
struct scrub_bio *sbio;
@@ -1995,6 +2060,8 @@ static void scrub_submit(struct scrub_ctx *sctx)
if (sctx->curr == -1)
return;
+ scrub_throttle(sctx);
+
sbio = sctx->bios[sctx->curr];
sctx->curr = -1;
scrub_pending_bio_inc(sctx);
@@ -2006,6 +2073,7 @@ static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
{
struct scrub_block *sblock = spage->sblock;
struct scrub_bio *sbio;
+ const u32 sectorsize = sctx->fs_info->sectorsize;
int ret;
again:
@@ -2044,9 +2112,9 @@ again:
bio->bi_iter.bi_sector = sbio->physical >> 9;
bio->bi_opf = REQ_OP_READ;
sbio->status = 0;
- } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+ } else if (sbio->physical + sbio->page_count * sectorsize !=
spage->physical ||
- sbio->logical + sbio->page_count * PAGE_SIZE !=
+ sbio->logical + sbio->page_count * sectorsize !=
spage->logical ||
sbio->dev != spage->dev) {
scrub_submit(sctx);
@@ -2054,8 +2122,8 @@ again:
}
sbio->pagev[sbio->page_count] = spage;
- ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
- if (ret != PAGE_SIZE) {
+ ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
+ if (ret != sectorsize) {
if (sbio->page_count < 1) {
bio_put(sbio->bio);
sbio->bio = NULL;
@@ -2398,7 +2466,7 @@ static void scrub_block_complete(struct scrub_block *sblock)
if (sblock->sparity && corrupted && !sblock->data_corrected) {
u64 start = sblock->pagev[0]->logical;
u64 end = sblock->pagev[sblock->page_count - 1]->logical +
- PAGE_SIZE;
+ sblock->sctx->fs_info->sectorsize;
ASSERT(end - start <= U32_MAX);
scrub_parity_mark_sectors_error(sblock->sparity,
@@ -2418,7 +2486,7 @@ static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *su
* the csum into @csum.
*
* The search source is sctx->csum_list, which is a pre-populated list
- * storing bytenr ordered csum ranges. We're reponsible to cleanup any range
+ * storing bytenr ordered csum ranges. We're responsible to cleanup any range
* that is before @logical.
*
* Return 0 if there is no csum for the range.
@@ -3138,28 +3206,23 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
physical = map->stripes[num].physical;
offset = 0;
nstripes = div64_u64(length, map->stripe_len);
+ mirror_num = 1;
+ increment = map->stripe_len;
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
offset = map->stripe_len * num;
increment = map->stripe_len * map->num_stripes;
- mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
int factor = map->num_stripes / map->sub_stripes;
offset = map->stripe_len * (num / map->sub_stripes);
increment = map->stripe_len * factor;
mirror_num = num % map->sub_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
- increment = map->stripe_len;
mirror_num = num % map->num_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- increment = map->stripe_len;
mirror_num = num % map->num_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
get_raid56_logic_offset(physical, num, map, &offset, NULL);
increment = map->stripe_len * nr_data_stripes(map);
- mirror_num = 1;
- } else {
- increment = map->stripe_len;
- mirror_num = 1;
}
path = btrfs_alloc_path();