summaryrefslogtreecommitdiff
path: root/fs/ext4/mballoc.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4/mballoc.c')
-rw-r--r--fs/ext4/mballoc.c289
1 files changed, 244 insertions, 45 deletions
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c0a331e2feb0..132c118d12e1 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -922,7 +922,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
bh[i] = NULL;
continue;
}
- bh[i] = ext4_read_block_bitmap_nowait(sb, group);
+ bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
if (IS_ERR(bh[i])) {
err = PTR_ERR(bh[i]);
bh[i] = NULL;
@@ -1279,9 +1279,6 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
e4b->bd_buddy_page = page;
e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
- BUG_ON(e4b->bd_bitmap_page == NULL);
- BUG_ON(e4b->bd_buddy_page == NULL);
-
return 0;
err:
@@ -1743,10 +1740,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
}
-/*
- * regular allocator, for general purposes allocation
- */
-
static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
struct ext4_buddy *e4b,
int finish_group)
@@ -2119,13 +2112,11 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
BUG_ON(cr < 0 || cr >= 4);
- free = grp->bb_free;
- if (free == 0)
- return false;
- if (cr <= 2 && free < ac->ac_g_ex.fe_len)
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
return false;
- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+ free = grp->bb_free;
+ if (free == 0)
return false;
fragments = grp->bb_fragments;
@@ -2142,8 +2133,10 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
((group % flex_size) == 0))
return false;
- if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
- (free / fragments) >= ac->ac_g_ex.fe_len)
+ if (free < ac->ac_g_ex.fe_len)
+ return false;
+
+ if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1)
return true;
if (grp->bb_largest_free_order < ac->ac_2order)
@@ -2177,6 +2170,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
{
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
ext4_grpblk_t free;
int ret = 0;
@@ -2195,7 +2189,25 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
- ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
+ struct ext4_group_desc *gdp =
+ ext4_get_group_desc(sb, group, NULL);
+ int ret;
+
+ /* cr=0/1 is a very optimistic search to find large
+ * good chunks almost for free. If buddy data is not
+ * ready, then this optimization makes no sense. But
+ * we never skip the first block group in a flex_bg,
+ * since this gets used for metadata block allocation,
+ * and we want to make sure we locate metadata blocks
+ * in the first block group in the flex_bg if possible.
+ */
+ if (cr < 2 &&
+ (!sbi->s_log_groups_per_flex ||
+ ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
+ !(ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
+ return 0;
+ ret = ext4_mb_init_group(sb, group, GFP_NOFS);
if (ret)
return ret;
}
@@ -2209,15 +2221,95 @@ out:
return ret;
}
+/*
+ * Start prefetching @nr block bitmaps starting at @group.
+ * Return the next group which needs to be prefetched.
+ */
+ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
+ unsigned int nr, int *cnt)
+{
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+ struct buffer_head *bh;
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
+ while (nr-- > 0) {
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
+ NULL);
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+ /*
+ * Prefetch block groups with free blocks; but don't
+ * bother if it is marked uninitialized on disk, since
+ * it won't require I/O to read. Also only try to
+ * prefetch once, so we avoid getblk() call, which can
+ * be expensive.
+ */
+ if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
+ EXT4_MB_GRP_NEED_INIT(grp) &&
+ ext4_free_group_clusters(sb, gdp) > 0 &&
+ !(ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+ bh = ext4_read_block_bitmap_nowait(sb, group, true);
+ if (bh && !IS_ERR(bh)) {
+ if (!buffer_uptodate(bh) && cnt)
+ (*cnt)++;
+ brelse(bh);
+ }
+ }
+ if (++group >= ngroups)
+ group = 0;
+ }
+ blk_finish_plug(&plug);
+ return group;
+}
+
+/*
+ * Prefetching reads the block bitmap into the buffer cache; but we
+ * need to make sure that the buddy bitmap in the page cache has been
+ * initialized. Note that ext4_mb_init_group() will block if the I/O
+ * is not yet completed, or indeed if it was not initiated by
+ * ext4_mb_prefetch did not start the I/O.
+ *
+ * TODO: We should actually kick off the buddy bitmap setup in a work
+ * queue when the buffer I/O is completed, so that we don't block
+ * waiting for the block allocation bitmap read to finish when
+ * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
+ */
+void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
+ unsigned int nr)
+{
+ while (nr-- > 0) {
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
+ NULL);
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+ if (!group)
+ group = ext4_get_groups_count(sb);
+ group--;
+ grp = ext4_get_group_info(sb, group);
+
+ if (EXT4_MB_GRP_NEED_INIT(grp) &&
+ ext4_free_group_clusters(sb, gdp) > 0 &&
+ !(ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+ if (ext4_mb_init_group(sb, group, GFP_NOFS))
+ break;
+ }
+ }
+}
+
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
- ext4_group_t ngroups, group, i;
+ ext4_group_t prefetch_grp = 0, ngroups, group, i;
int cr = -1;
int err = 0, first_err = 0;
+ unsigned int nr = 0, prefetch_ios = 0;
struct ext4_sb_info *sbi;
struct super_block *sb;
struct ext4_buddy e4b;
+ int lost;
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
@@ -2237,8 +2329,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
goto out;
/*
- * ac->ac2_order is set only if the fe_len is a power of 2
- * if ac2_order is set we also set criteria to 0 so that we
+ * ac->ac_2order is set only if the fe_len is a power of 2
+ * if ac->ac_2order is set we also set criteria to 0 so that we
* try exact allocation using buddy.
*/
i = fls(ac->ac_g_ex.fe_len);
@@ -2282,6 +2374,7 @@ repeat:
* from the goal value specified
*/
group = ac->ac_g_ex.fe_group;
+ prefetch_grp = group;
for (i = 0; i < ngroups; group++, i++) {
int ret = 0;
@@ -2293,6 +2386,29 @@ repeat:
if (group >= ngroups)
group = 0;
+ /*
+ * Batch reads of the block allocation bitmaps
+ * to get multiple READs in flight; limit
+ * prefetching at cr=0/1, otherwise mballoc can
+ * spend a lot of time loading imperfect groups
+ */
+ if ((prefetch_grp == group) &&
+ (cr > 1 ||
+ prefetch_ios < sbi->s_mb_prefetch_limit)) {
+ unsigned int curr_ios = prefetch_ios;
+
+ nr = sbi->s_mb_prefetch;
+ if (ext4_has_feature_flex_bg(sb)) {
+ nr = (group / sbi->s_mb_prefetch) *
+ sbi->s_mb_prefetch;
+ nr = nr + sbi->s_mb_prefetch - group;
+ }
+ prefetch_grp = ext4_mb_prefetch(sb, group,
+ nr, &prefetch_ios);
+ if (prefetch_ios == curr_ios)
+ nr = 0;
+ }
+
/* This now checks without needing the buddy page */
ret = ext4_mb_good_group_nolock(ac, group, cr);
if (ret <= 0) {
@@ -2341,22 +2457,24 @@ repeat:
* We've been searching too long. Let's try to allocate
* the best chunk we've found so far
*/
-
ext4_mb_try_best_found(ac, &e4b);
if (ac->ac_status != AC_STATUS_FOUND) {
/*
* Someone more lucky has already allocated it.
* The only thing we can do is just take first
* found block(s)
- printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
*/
+ lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
+ mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
+ ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len, lost);
+
ac->ac_b_ex.fe_group = 0;
ac->ac_b_ex.fe_start = 0;
ac->ac_b_ex.fe_len = 0;
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_flags |= EXT4_MB_HINT_FIRST;
cr = 3;
- atomic_inc(&sbi->s_mb_lost_chunks);
goto repeat;
}
}
@@ -2367,6 +2485,10 @@ out:
mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
ac->ac_flags, cr, err);
+
+ if (nr)
+ ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
+
return err;
}
@@ -2439,7 +2561,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
for (i = 0; i <= 13; i++)
seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
sg.info.bb_counters[i] : 0);
- seq_printf(seq, " ]\n");
+ seq_puts(seq, " ]\n");
return 0;
}
@@ -2613,6 +2735,26 @@ static int ext4_mb_init_backend(struct super_block *sb)
goto err_freebuddy;
}
+ if (ext4_has_feature_flex_bg(sb)) {
+ /* a single flex group is supposed to be read by a single IO */
+ sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
+ sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
+ } else {
+ sbi->s_mb_prefetch = 32;
+ }
+ if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
+ sbi->s_mb_prefetch = ext4_get_groups_count(sb);
+ /* now many real IOs to prefetch within a single allocation at cr=0
+ * given cr=0 is an CPU-related optimization we shouldn't try to
+ * load too many groups, at some point we should start to use what
+ * we've got in memory.
+ * with an average random access time 5ms, it'd take a second to get
+ * 200 groups (* N with flex_bg), so let's make this limit 4
+ */
+ sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
+ if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
+ sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
+
return 0;
err_freebuddy:
@@ -2736,6 +2878,7 @@ int ext4_mb_init(struct super_block *sb)
sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+ sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
/*
* The default group preallocation is 512, which for 4k block
* sizes translates to 2 megabytes. However for bigalloc file
@@ -3090,7 +3233,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
- if (!ext4_data_block_valid(sbi, block, len)) {
+ if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
"fs metadata", block, block+len);
/* File system mounted not to panic on error
@@ -3674,6 +3817,26 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
}
+static void ext4_mb_mark_pa_deleted(struct super_block *sb,
+ struct ext4_prealloc_space *pa)
+{
+ struct ext4_inode_info *ei;
+
+ if (pa->pa_deleted) {
+ ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
+ pa->pa_type, pa->pa_pstart, pa->pa_lstart,
+ pa->pa_len);
+ return;
+ }
+
+ pa->pa_deleted = 1;
+
+ if (pa->pa_type == MB_INODE_PA) {
+ ei = EXT4_I(pa->pa_inode);
+ atomic_dec(&ei->i_prealloc_active);
+ }
+}
+
static void ext4_mb_pa_callback(struct rcu_head *head)
{
struct ext4_prealloc_space *pa;
@@ -3706,7 +3869,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
return;
}
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
grp_blk = pa->pa_pstart;
@@ -3830,6 +3993,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
spin_lock(pa->pa_obj_lock);
list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
spin_unlock(pa->pa_obj_lock);
+ atomic_inc(&ei->i_prealloc_active);
}
/*
@@ -4040,7 +4204,7 @@ repeat:
}
/* seems this one can be freed ... */
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
/* we can trust pa_free ... */
free += pa->pa_free;
@@ -4103,7 +4267,7 @@ out_dbg:
*
* FIXME!! Make sure it is valid at all the call sites
*/
-void ext4_discard_preallocations(struct inode *inode)
+void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct super_block *sb = inode->i_sb;
@@ -4121,15 +4285,19 @@ void ext4_discard_preallocations(struct inode *inode)
mb_debug(sb, "discard preallocation for inode %lu\n",
inode->i_ino);
- trace_ext4_discard_preallocations(inode);
+ trace_ext4_discard_preallocations(inode,
+ atomic_read(&ei->i_prealloc_active), needed);
INIT_LIST_HEAD(&list);
+ if (needed == 0)
+ needed = UINT_MAX;
+
repeat:
/* first, collect all pa's in the inode */
spin_lock(&ei->i_prealloc_lock);
- while (!list_empty(&ei->i_prealloc_list)) {
- pa = list_entry(ei->i_prealloc_list.next,
+ while (!list_empty(&ei->i_prealloc_list) && needed) {
+ pa = list_entry(ei->i_prealloc_list.prev,
struct ext4_prealloc_space, pa_inode_list);
BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
spin_lock(&pa->pa_lock);
@@ -4146,10 +4314,11 @@ repeat:
}
if (pa->pa_deleted == 0) {
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
list_del_rcu(&pa->pa_inode_list);
list_add(&pa->u.pa_tmp_list, &list);
+ needed--;
continue;
}
@@ -4399,7 +4568,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
ac->ac_g_ex = ac->ac_o_ex;
ac->ac_flags = ar->flags;
- /* we have to define context: we'll we work with a file or
+ /* we have to define context: we'll work with a file or
* locality group. this is a policy, actually */
ext4_mb_group_or_file(ac);
@@ -4450,7 +4619,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
BUG_ON(pa->pa_type != MB_GROUP_PA);
/* seems this one can be freed ... */
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
list_del_rcu(&pa->pa_inode_list);
@@ -4549,10 +4718,29 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
}
/*
+ * if per-inode prealloc list is too long, trim some PA
+ */
+static void ext4_mb_trim_inode_pa(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int count, delta;
+
+ count = atomic_read(&ei->i_prealloc_active);
+ delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
+ if (count > sbi->s_mb_max_inode_prealloc + delta) {
+ count -= sbi->s_mb_max_inode_prealloc;
+ ext4_discard_preallocations(inode, count);
+ }
+}
+
+/*
* release all resource we used in allocation
*/
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
+ struct inode *inode = ac->ac_inode;
+ struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_prealloc_space *pa = ac->ac_pa;
if (pa) {
@@ -4564,21 +4752,31 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
pa->pa_free -= ac->ac_b_ex.fe_len;
pa->pa_len -= ac->ac_b_ex.fe_len;
spin_unlock(&pa->pa_lock);
+
+ /*
+ * We want to add the pa to the right bucket.
+ * Remove it from the list and while adding
+ * make sure the list to which we are adding
+ * doesn't grow big.
+ */
+ if (likely(pa->pa_free)) {
+ spin_lock(pa->pa_obj_lock);
+ list_del_rcu(&pa->pa_inode_list);
+ spin_unlock(pa->pa_obj_lock);
+ ext4_mb_add_n_trim(ac);
+ }
}
- }
- if (pa) {
- /*
- * We want to add the pa to the right bucket.
- * Remove it from the list and while adding
- * make sure the list to which we are adding
- * doesn't grow big.
- */
- if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
+
+ if (pa->pa_type == MB_INODE_PA) {
+ /*
+ * treat per-inode prealloc list as a lru list, then try
+ * to trim the least recently used PA.
+ */
spin_lock(pa->pa_obj_lock);
- list_del_rcu(&pa->pa_inode_list);
+ list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
spin_unlock(pa->pa_obj_lock);
- ext4_mb_add_n_trim(ac);
}
+
ext4_mb_put_pa(ac, ac->ac_sb, pa);
}
if (ac->ac_bitmap_page)
@@ -4588,6 +4786,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
mutex_unlock(&ac->ac_lg->lg_mutex);
ext4_mb_collect_stats(ac);
+ ext4_mb_trim_inode_pa(inode);
return 0;
}
@@ -4915,7 +5114,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
sbi = EXT4_SB(sb);
if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
- !ext4_data_block_valid(sbi, block, count)) {
+ !ext4_inode_block_valid(inode, block, count)) {
ext4_error(sb, "Freeing blocks not in datazone - "
"block = %llu, count = %lu", block, count);
goto error_return;