summaryrefslogtreecommitdiff
path: root/fs/ext4/ialloc.c
diff options
context:
space:
mode:
authorTheodore Ts'o <tytso@mit.edu>2009-03-12 19:18:34 +0300
committerTheodore Ts'o <tytso@mit.edu>2009-03-12 19:18:34 +0300
commita4912123b688e057084e6557cef8924f7ae5bbde (patch)
tree34e88705d6617b52caa0f87692b480119a9c9e2e /fs/ext4/ialloc.c
parent2dc6b0d48ca0599837df21b14bb8393d0804af57 (diff)
downloadlinux-a4912123b688e057084e6557cef8924f7ae5bbde.tar.xz
ext4: New inode/block allocation algorithms for flex_bg filesystems
The find_group_flex() inode allocator is now only used if the filesystem is mounted using the "oldalloc" mount option. It is replaced with the original Orlov allocator that has been updated for flex_bg filesystems (it should behave the same way if flex_bg is disabled). The inode allocator now functions by taking into account each flex_bg group, instead of each block group, when deciding whether or not it's time to allocate a new directory into a fresh flex_bg. The block allocator has also been changed so that the first block group in each flex_bg is preferred for use for storing directory blocks. This keeps directory blocks close together, which is good for speeding up e2fsck since large directories are more likely to look like this: debugfs: stat /home/tytso/Maildir/cur Inode: 1844562 Type: directory Mode: 0700 Flags: 0x81000 Generation: 1132745781 Version: 0x00000000:0000ad71 User: 15806 Group: 15806 Size: 1060864 File ACL: 0 Directory ACL: 0 Links: 2 Blockcount: 2072 Fragment: Address: 0 Number: 0 Size: 0 ctime: 0x499c0ff4:164961f4 -- Wed Feb 18 08:41:08 2009 atime: 0x499c0ff4:00000000 -- Wed Feb 18 08:41:08 2009 mtime: 0x49957f51:00000000 -- Fri Feb 13 09:10:25 2009 crtime: 0x499c0f57:00d51440 -- Wed Feb 18 08:38:31 2009 Size of extra inode fields: 28 BLOCKS: (0):7348651, (1-258):7348654-7348911 TOTAL: 259 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs/ext4/ialloc.c')
-rw-r--r--fs/ext4/ialloc.c215
1 files changed, 159 insertions, 56 deletions
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index ae3eb57dccdd..617f5a2d800a 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -410,6 +410,43 @@ out:
return 0;
}
+struct orlov_stats {
+ __u32 free_inodes;
+ __u32 free_blocks;
+ __u32 used_dirs;
+};
+
+/*
+ * Helper function for Orlov's allocator; sums the free inode, free block
+ * and used directory counts of a particular block group or flex_bg into
+ * *stats. If flex_size is 1, then g is a block group number; otherwise it is a flex_bg number.
+ */
+void get_orlov_stats(struct super_block *sb, ext4_group_t g,
+ int flex_size, struct orlov_stats *stats)
+{
+ struct ext4_group_desc *desc;
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+ int i;
+
+ stats->free_inodes = 0;
+ stats->free_blocks = 0;
+ stats->used_dirs = 0;
+
+ g *= flex_size;
+
+ for (i = 0; i < flex_size; i++) {
+ if (g >= ngroups)
+ break;
+ desc = ext4_get_group_desc(sb, g++, NULL);
+ if (!desc)
+ continue;
+
+ stats->free_inodes += ext4_free_inodes_count(sb, desc);
+ stats->free_blocks += ext4_free_blks_count(sb, desc);
+ stats->used_dirs += ext4_used_dirs_count(sb, desc);
+ }
+}
+
/*
* Orlov's allocator for directories.
*
@@ -425,35 +462,34 @@ out:
* it has too many directories already (max_dirs) or
* it has too few free inodes left (min_inodes) or
* it has too few free blocks left (min_blocks) or
- * it's already running too large debt (max_debt).
* Parent's group is preferred, if it doesn't satisfy these
* conditions we search cyclically through the rest. If none
* of the groups look good we just look for a group with more
* free inodes than average (starting at parent's group).
- *
- * Debt is incremented each time we allocate a directory and decremented
- * when we allocate an inode, within 0--255.
*/
-#define INODE_COST 64
-#define BLOCK_COST 256
-
static int find_group_orlov(struct super_block *sb, struct inode *parent,
- ext4_group_t *group)
+ ext4_group_t *group, int mode)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_super_block *es = sbi->s_es;
ext4_group_t ngroups = sbi->s_groups_count;
int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
unsigned int freei, avefreei;
ext4_fsblk_t freeb, avefreeb;
- ext4_fsblk_t blocks_per_dir;
unsigned int ndirs;
- int max_debt, max_dirs, min_inodes;
+ int max_dirs, min_inodes;
ext4_grpblk_t min_blocks;
- ext4_group_t i;
+ ext4_group_t i, grp, g;
struct ext4_group_desc *desc;
+ struct orlov_stats stats;
+ int flex_size = ext4_flex_bg_size(sbi);
+
+ if (flex_size > 1) {
+ ngroups = (ngroups + flex_size - 1) >>
+ sbi->s_log_groups_per_flex;
+ parent_group >>= sbi->s_log_groups_per_flex;
+ }
freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
avefreei = freei / ngroups;
@@ -462,71 +498,97 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
do_div(avefreeb, ngroups);
ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
- if ((parent == sb->s_root->d_inode) ||
- (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
+ if (S_ISDIR(mode) &&
+ ((parent == sb->s_root->d_inode) ||
+ (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
int best_ndir = inodes_per_group;
- ext4_group_t grp;
int ret = -1;
get_random_bytes(&grp, sizeof(grp));
parent_group = (unsigned)grp % ngroups;
for (i = 0; i < ngroups; i++) {
- grp = (parent_group + i) % ngroups;
- desc = ext4_get_group_desc(sb, grp, NULL);
- if (!desc || !ext4_free_inodes_count(sb, desc))
+ g = (parent_group + i) % ngroups;
+ get_orlov_stats(sb, g, flex_size, &stats);
+ if (!stats.free_inodes)
continue;
- if (ext4_used_dirs_count(sb, desc) >= best_ndir)
+ if (stats.used_dirs >= best_ndir)
continue;
- if (ext4_free_inodes_count(sb, desc) < avefreei)
+ if (stats.free_inodes < avefreei)
continue;
- if (ext4_free_blks_count(sb, desc) < avefreeb)
+ if (stats.free_blocks < avefreeb)
continue;
- *group = grp;
+ grp = g;
ret = 0;
- best_ndir = ext4_used_dirs_count(sb, desc);
+ best_ndir = stats.used_dirs;
+ }
+ if (ret)
+ goto fallback;
+ found_flex_bg:
+ if (flex_size == 1) {
+ *group = grp;
+ return 0;
+ }
+
+ /*
+ * We pack inodes at the beginning of the flexgroup's
+ * inode tables. Block allocation decisions will do
+ * something similar, although regular files will
+ * start at 2nd block group of the flexgroup. See
+ * ext4_ext_find_goal() and ext4_find_near().
+ */
+ grp *= flex_size;
+ for (i = 0; i < flex_size; i++) {
+ if (grp+i >= sbi->s_groups_count)
+ break;
+ desc = ext4_get_group_desc(sb, grp+i, NULL);
+ if (desc && ext4_free_inodes_count(sb, desc)) {
+ *group = grp+i;
+ return 0;
+ }
}
- if (ret == 0)
- return ret;
goto fallback;
}
- blocks_per_dir = ext4_blocks_count(es) - freeb;
- do_div(blocks_per_dir, ndirs);
-
max_dirs = ndirs / ngroups + inodes_per_group / 16;
- min_inodes = avefreei - inodes_per_group / 4;
- min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
-
- max_debt = EXT4_BLOCKS_PER_GROUP(sb);
- max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
- if (max_debt * INODE_COST > inodes_per_group)
- max_debt = inodes_per_group / INODE_COST;
- if (max_debt > 255)
- max_debt = 255;
- if (max_debt == 0)
- max_debt = 1;
+ min_inodes = avefreei - inodes_per_group*flex_size / 4;
+ if (min_inodes < 1)
+ min_inodes = 1;
+ min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
+
+ /*
+ * Start looking in the flex group where we last allocated an
+ * inode for this parent directory
+ */
+ if (EXT4_I(parent)->i_last_alloc_group != ~0) {
+ parent_group = EXT4_I(parent)->i_last_alloc_group;
+ if (flex_size > 1)
+ parent_group >>= sbi->s_log_groups_per_flex;
+ }
for (i = 0; i < ngroups; i++) {
- *group = (parent_group + i) % ngroups;
- desc = ext4_get_group_desc(sb, *group, NULL);
- if (!desc || !ext4_free_inodes_count(sb, desc))
- continue;
- if (ext4_used_dirs_count(sb, desc) >= max_dirs)
+ grp = (parent_group + i) % ngroups;
+ get_orlov_stats(sb, grp, flex_size, &stats);
+ if (stats.used_dirs >= max_dirs)
continue;
- if (ext4_free_inodes_count(sb, desc) < min_inodes)
+ if (stats.free_inodes < min_inodes)
continue;
- if (ext4_free_blks_count(sb, desc) < min_blocks)
+ if (stats.free_blocks < min_blocks)
continue;
- return 0;
+ goto found_flex_bg;
}
fallback:
+ ngroups = sbi->s_groups_count;
+ avefreei = freei / ngroups;
+ parent_group = EXT4_I(parent)->i_block_group;
for (i = 0; i < ngroups; i++) {
- *group = (parent_group + i) % ngroups;
- desc = ext4_get_group_desc(sb, *group, NULL);
+ grp = (parent_group + i) % ngroups;
+ desc = ext4_get_group_desc(sb, grp, NULL);
if (desc && ext4_free_inodes_count(sb, desc) &&
- ext4_free_inodes_count(sb, desc) >= avefreei)
+ ext4_free_inodes_count(sb, desc) >= avefreei) {
+ *group = grp;
return 0;
+ }
}
if (avefreei) {
@@ -542,12 +604,51 @@ fallback:
}
static int find_group_other(struct super_block *sb, struct inode *parent,
- ext4_group_t *group)
+ ext4_group_t *group, int mode)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
struct ext4_group_desc *desc;
- ext4_group_t i;
+ ext4_group_t i, last;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
+
+ /*
+ * Try to place the inode in the same flex group as its
+ * parent. If we can't find space, use the Orlov algorithm to
+ * find another flex group, and store that information in the
+ * parent directory's inode information so that we use that flex
+ * group for future allocations.
+ */
+ if (flex_size > 1) {
+ int retry = 0;
+
+ try_again:
+ parent_group &= ~(flex_size-1);
+ last = parent_group + flex_size;
+ if (last > ngroups)
+ last = ngroups;
+ for (i = parent_group; i < last; i++) {
+ desc = ext4_get_group_desc(sb, i, NULL);
+ if (desc && ext4_free_inodes_count(sb, desc)) {
+ *group = i;
+ return 0;
+ }
+ }
+ if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
+ retry = 1;
+ parent_group = EXT4_I(parent)->i_last_alloc_group;
+ goto try_again;
+ }
+ /*
+ * If this didn't work, use the Orlov search algorithm
+ * to find a new flex group; we pass in the mode to
+ * avoid the topdir algorithms.
+ */
+ *group = parent_group + flex_size;
+ if (*group > ngroups)
+ *group = 0;
+ return find_group_orlov(sb, parent, group, mode);
+ }
/*
* Try to place the inode in its parent directory
@@ -716,10 +817,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
sbi = EXT4_SB(sb);
es = sbi->s_es;
- if (sbi->s_log_groups_per_flex) {
+ if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
ret2 = find_group_flex(sb, dir, &group);
if (ret2 == -1) {
- ret2 = find_group_other(sb, dir, &group);
+ ret2 = find_group_other(sb, dir, &group, mode);
if (ret2 == 0 && once)
once = 0;
printk(KERN_NOTICE "ext4: find_group_flex "
@@ -733,11 +834,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
if (test_opt(sb, OLDALLOC))
ret2 = find_group_dir(sb, dir, &group);
else
- ret2 = find_group_orlov(sb, dir, &group);
+ ret2 = find_group_orlov(sb, dir, &group, mode);
} else
- ret2 = find_group_other(sb, dir, &group);
+ ret2 = find_group_other(sb, dir, &group, mode);
got_group:
+ EXT4_I(dir)->i_last_alloc_group = group;
err = -ENOSPC;
if (ret2 == -1)
goto out;
@@ -894,6 +996,7 @@ got:
ei->i_file_acl = 0;
ei->i_dtime = 0;
ei->i_block_group = group;
+ ei->i_last_alloc_group = ~0;
ext4_set_inode_flags(inode);
if (IS_DIRSYNC(inode))