diff options
-rw-r--r-- | Documentation/filesystems/ext4.txt | 7 | ||||
-rw-r--r-- | fs/ext4/balloc.c | 4 | ||||
-rw-r--r-- | fs/ext4/ext4.h | 29 | ||||
-rw-r--r-- | fs/ext4/extents.c | 10 | ||||
-rw-r--r-- | fs/ext4/ialloc.c | 18 | ||||
-rw-r--r-- | fs/ext4/inode.c | 143 | ||||
-rw-r--r-- | fs/ext4/ioctl.c | 86 | ||||
-rw-r--r-- | fs/ext4/mballoc.c | 2 | ||||
-rw-r--r-- | fs/ext4/resize.c | 1175 | ||||
-rw-r--r-- | fs/ext4/super.c | 11 | ||||
-rw-r--r-- | fs/ext4/xattr_security.c | 5 | ||||
-rw-r--r-- | fs/jbd2/commit.c | 6 | ||||
-rw-r--r-- | fs/jbd2/revoke.c | 34 | ||||
-rw-r--r-- | fs/jbd2/transaction.c | 5 | ||||
-rw-r--r-- | include/linux/jbd2.h | 1 | ||||
-rw-r--r-- | include/trace/events/ext4.h | 6 |
16 files changed, 1079 insertions, 463 deletions
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 4917cf24a5e0..10ec4639f152 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -581,6 +581,13 @@ Table of Ext4 specific ioctls behaviour may change in the future as it is not necessary and has been done this way only for sake of simplicity. + + EXT4_IOC_RESIZE_FS Resize the filesystem to a new size. The number + of blocks of resized filesystem is passed in via + 64 bit integer argument. The kernel allocates + bitmaps and inode table, the userspace tool thus + just passes the new number of blocks. + .............................................................................. References diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 12ccacda44e0..f9e2cd8cf711 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -23,6 +23,8 @@ #include <trace/events/ext4.h> +static unsigned ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group); /* * balloc.c contains the blocks allocation and deallocation routines */ @@ -668,7 +670,7 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) * This function returns the number of file system metadata clusters at * the beginning of a block group, including the reserved gdt blocks. */ -unsigned ext4_num_base_meta_clusters(struct super_block *sb, +static unsigned ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group) { struct ext4_sb_info *sbi = EXT4_SB(sb); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1554b15f91bc..513004fc3d84 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -511,6 +511,14 @@ struct ext4_new_group_data { __u32 free_blocks_count; }; +/* Indexes used to index group tables in ext4_new_group_data */ +enum { + BLOCK_BITMAP = 0, /* block bitmap */ + INODE_BITMAP, /* inode bitmap */ + INODE_TABLE, /* inode tables */ + GROUP_TABLE_COUNT, +}; + /* * Flags used by ext4_map_blocks() */ @@ -575,6 +583,7 @@ struct ext4_new_group_data { /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) +#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -957,12 +966,13 @@ struct ext4_inode_info { #define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ EXT4_MOUNT2_##opt) -#define ext4_set_bit __test_and_set_bit_le +#define ext4_test_and_set_bit __test_and_set_bit_le +#define ext4_set_bit __set_bit_le #define ext4_set_bit_atomic ext2_set_bit_atomic -#define ext4_clear_bit __test_and_clear_bit_le +#define ext4_test_and_clear_bit __test_and_clear_bit_le +#define ext4_clear_bit __clear_bit_le #define ext4_clear_bit_atomic ext2_clear_bit_atomic #define ext4_test_bit test_bit_le -#define ext4_find_first_zero_bit find_first_zero_bit_le #define ext4_find_next_zero_bit find_next_zero_bit_le #define ext4_find_next_bit find_next_bit_le @@ -1397,6 +1407,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 #define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 +#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -1409,6 +1420,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ +#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x2000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ @@ -1790,8 +1803,6 @@ extern void ext4_init_block_bitmap(struct super_block *sb, extern unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp); -extern unsigned ext4_num_base_meta_clusters(struct super_block *sb, - ext4_group_t block_group); extern unsigned ext4_num_overhead_clusters(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp); @@ -1880,16 +1891,9 @@ extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); -extern int ext4_block_truncate_page(handle_t *handle, - struct address_space *mapping, loff_t from); -extern int ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length); extern int ext4_discard_partial_page_buffers(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length, int flags); -extern int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, - struct inode *inode, struct page *page, loff_t from, - loff_t length, int flags); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, @@ -1924,6 +1928,7 @@ extern int ext4_group_add(struct super_block *sb, extern int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t n_blocks_count); +extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ extern void *ext4_kvmalloc(size_t size, gfp_t flags); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 841faf5fb785..74f23c292e1b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3280,6 +3280,9 @@ static int ext4_find_delalloc_range(struct inode *inode, ext4_lblk_t i, pg_lblk; pgoff_t index; + if (!test_opt(inode->i_sb, DELALLOC)) + return 0; + /* reverse search wont work if fs block size is less than page size */ if (inode->i_blkbits < PAGE_CACHE_SHIFT) search_hint_reverse = 0; @@ -3452,8 +3455,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, int err = 0; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; - ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" - "block %llu, max_blocks %u, flags %d, allocated %u", + ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " + "block %llu, max_blocks %u, flags %x, allocated %u\n", inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, flags, allocated); ext4_ext_show_leaf(inode, path); @@ -3624,7 +3627,7 @@ static int get_implied_cluster_alloc(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); ext4_lblk_t ex_cluster_start, ex_cluster_end; - ext4_lblk_t rr_cluster_start, rr_cluster_end; + ext4_lblk_t rr_cluster_start; ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len = ext4_ext_get_actual_len(ex); @@ -3635,7 +3638,6 @@ static int get_implied_cluster_alloc(struct super_block *sb, /* The requested region passed into ext4_map_blocks() */ rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); - rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1); if ((rr_cluster_start == ex_cluster_end) || (rr_cluster_start == ex_cluster_start)) { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 4637af036d9c..25d8c9781ad9 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -252,7 +252,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) fatal = ext4_journal_get_write_access(handle, bh2); } ext4_lock_group(sb, block_group); - cleared = ext4_clear_bit(bit, bitmap_bh->b_data); + cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data); if (fatal || !cleared) { ext4_unlock_group(sb, block_group); goto out; @@ -358,7 +358,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t real_ngroups = ext4_get_groups_count(sb); int inodes_per_group = EXT4_INODES_PER_GROUP(sb); - unsigned int freei, avefreei; + unsigned int freei, avefreei, grp_free; ext4_fsblk_t freeb, avefreec; unsigned int ndirs; int max_dirs, min_inodes; @@ -477,8 +477,8 @@ fallback_retry: for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, grp, NULL); - if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_inodes_count(sb, desc) >= avefreei) { + grp_free = ext4_free_inodes_count(sb, desc); + if (desc && grp_free && grp_free >= avefreei) { *group = grp; return 0; } @@ -618,7 +618,7 @@ static int ext4_claim_inode(struct super_block *sb, */ down_read(&grp->alloc_sem); ext4_lock_group(sb, group); - if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { + if (ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data)) { /* not a free inode */ retval = 1; goto err_ret; @@ -885,8 +885,12 @@ got: if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); if (insert_inode_locked(inode) < 0) { - err = -EINVAL; - goto fail_drop; + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. + */ + err = -EIO; + goto fail; } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index aa8efa6572d6..feaa82fe629d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -71,6 +71,9 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); static int __ext4_journalled_writepage(struct page *page, unsigned int len); static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); +static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, + struct inode *inode, struct page *page, loff_t from, + loff_t length, int flags); /* * Test whether an inode is a fast symlink. @@ -2759,7 +2762,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, if (!io_end || !size) goto out; - ext_debug("ext4_end_io_dio(): io_end 0x%p" + ext_debug("ext4_end_io_dio(): io_end 0x%p " "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", iocb->private, io_end->inode->i_ino, iocb, offset, size); @@ -3160,7 +3163,7 @@ int ext4_discard_partial_page_buffers(handle_t *handle, * * Returns zero on sucess or negative on failure. */ -int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, +static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, struct inode *inode, struct page *page, loff_t from, loff_t length, int flags) { @@ -3300,126 +3303,6 @@ next: return err; } -/* - * ext4_block_truncate_page() zeroes out a mapping from file offset `from' - * up to the end of the block which corresponds to `from'. - * This required during truncate. We need to physically zero the tail end - * of that block so it doesn't yield old data if the file is later grown. - */ -int ext4_block_truncate_page(handle_t *handle, - struct address_space *mapping, loff_t from) -{ - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned length; - unsigned blocksize; - struct inode *inode = mapping->host; - - blocksize = inode->i_sb->s_blocksize; - length = blocksize - (offset & (blocksize - 1)); - - return ext4_block_zero_page_range(handle, mapping, from, length); -} - -/* - * ext4_block_zero_page_range() zeros out a mapping of length 'length' - * starting from file offset 'from'. The range to be zero'd must - * be contained with in one block. If the specified range exceeds - * the end of the block it will be shortened to end of the block - * that cooresponds to 'from' - */ -int ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length) -{ - ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned blocksize, max, pos; - ext4_lblk_t iblock; - struct inode *inode = mapping->host; - struct buffer_head *bh; - struct page *page; - int err = 0; - - page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, - mapping_gfp_mask(mapping) & ~__GFP_FS); - if (!page) - return -ENOMEM; - - blocksize = inode->i_sb->s_blocksize; - max = blocksize - (offset & (blocksize - 1)); - - /* - * correct length if it does not fall between - * 'from' and the end of the block - */ - if (length > max || length < 0) - length = max; - - iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); - - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - - /* Find the buffer that contains "offset" */ - bh = page_buffers(page); - pos = blocksize; - while (offset >= pos) { - bh = bh->b_this_page; - iblock++; - pos += blocksize; - } - - err = 0; - if (buffer_freed(bh)) { - BUFFER_TRACE(bh, "freed: skip"); - goto unlock; - } - - if (!buffer_mapped(bh)) { - BUFFER_TRACE(bh, "unmapped"); - ext4_get_block(inode, iblock, bh, 0); - /* unmapped? It's a hole - nothing to do */ - if (!buffer_mapped(bh)) { - BUFFER_TRACE(bh, "still unmapped"); - goto unlock; - } - } - - /* Ok, it's mapped. Make sure it's up-to-date */ - if (PageUptodate(page)) - set_buffer_uptodate(bh); - - if (!buffer_uptodate(bh)) { - err = -EIO; - ll_rw_block(READ, 1, &bh); - wait_on_buffer(bh); - /* Uhhuh. Read error. Complain and punt. */ - if (!buffer_uptodate(bh)) - goto unlock; - } - - if (ext4_should_journal_data(inode)) { - BUFFER_TRACE(bh, "get write access"); - err = ext4_journal_get_write_access(handle, bh); - if (err) - goto unlock; - } - - zero_user(page, offset, length); - - BUFFER_TRACE(bh, "zeroed end of block"); - - err = 0; - if (ext4_should_journal_data(inode)) { - err = ext4_handle_dirty_metadata(handle, inode, bh); - } else - mark_buffer_dirty(bh); - -unlock: - unlock_page(page); - page_cache_release(page); - return err; -} - int ext4_can_truncate(struct inode *inode) { if (S_ISREG(inode->i_mode)) @@ -4646,9 +4529,19 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return 0; if (is_journal_aborted(journal)) return -EROFS; + /* We have to allocate physical blocks for delalloc blocks + * before flushing journal. otherwise delalloc blocks can not + * be allocated any more. even more truncate on delalloc blocks + * could trigger BUG by flushing delalloc blocks in journal. + * There is no delalloc block in non-journal data mode. + */ + if (val && test_opt(inode->i_sb, DELALLOC)) { + err = ext4_alloc_da_blocks(inode); + if (err < 0) + return err; + } jbd2_journal_lock_updates(journal); - jbd2_journal_flush(journal); /* * OK, there are no updates running now, and all cached data is @@ -4660,8 +4553,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) if (val) ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); - else + else { + jbd2_journal_flush(journal); ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); + } ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index e87a932b073b..6eee25591b81 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -18,6 +18,8 @@ #include "ext4_jbd2.h" #include "ext4.h" +#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = filp->f_dentry->d_inode; @@ -186,19 +188,22 @@ setversion_out: if (err) return err; - if (get_user(n_blocks_count, (__u32 __user *)arg)) - return -EFAULT; + if (get_user(n_blocks_count, (__u32 __user *)arg)) { + err = -EFAULT; + goto group_extend_out; + } if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto group_extend_out; } err = mnt_want_write_file(filp); if (err) - return err; + goto group_extend_out; err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); if (EXT4_SB(sb)->s_journal) { @@ -209,8 +214,8 @@ setversion_out: if (err == 0) err = err2; mnt_drop_write_file(filp); +group_extend_out: ext4_resize_end(sb); - return err; } @@ -251,8 +256,7 @@ setversion_out: err = ext4_move_extents(filp, donor_filp, me.orig_start, me.donor_start, me.len, &me.moved_len); mnt_drop_write_file(filp); - if (me.moved_len > 0) - file_remove_suid(donor_filp); + mnt_drop_write(filp->f_path.mnt); if (copy_to_user((struct move_extent __user *)arg, &me, sizeof(me))) @@ -271,19 +275,22 @@ mext_out: return err; if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, - sizeof(input))) - return -EFAULT; + sizeof(input))) { + err = -EFAULT; + goto group_add_out; + } if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto group_add_out; } err = mnt_want_write_file(filp); if (err) - return err; + goto group_add_out; err = ext4_group_add(sb, &input); if (EXT4_SB(sb)->s_journal) { @@ -294,8 +301,8 @@ mext_out: if (err == 0) err = err2; mnt_drop_write_file(filp); +group_add_out: ext4_resize_end(sb); - return err; } @@ -335,6 +342,60 @@ mext_out: return err; } + case EXT4_IOC_RESIZE_FS: { + ext4_fsblk_t n_blocks_count; + struct super_block *sb = inode->i_sb; + int err = 0, err2 = 0; + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not (yet) supported with bigalloc"); + return -EOPNOTSUPP; + } + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_META_BG)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not (yet) supported with meta_bg"); + return -EOPNOTSUPP; + } + + if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, + sizeof(__u64))) { + return -EFAULT; + } + + if (n_blocks_count > MAX_32_NUM && + !EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_64BIT)) { + ext4_msg(sb, KERN_ERR, + "File system only supports 32-bit block numbers"); + return -EOPNOTSUPP; + } + + err = ext4_resize_begin(sb); + if (err) + return err; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + goto resizefs_out; + + err = ext4_resize_fs(sb, n_blocks_count); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write(filp->f_path.mnt); +resizefs_out: + ext4_resize_end(sb); + return err; + } + case FITRIM: { struct request_queue *q = bdev_get_queue(sb->s_bdev); @@ -433,6 +494,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } case EXT4_IOC_MOVE_EXT: case FITRIM: + case EXT4_IOC_RESIZE_FS: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e2d8be8f28bf..cb990b21c698 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3671,7 +3671,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, ext4_group_t group; ext4_grpblk_t bit; - trace_ext4_mb_release_group_pa(pa); + trace_ext4_mb_release_group_pa(sb, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 996780ab4f4e..f9d948f0eb86 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -134,6 +134,172 @@ static int verify_group_input(struct super_block *sb, return err; } +/* + * ext4_new_flex_group_data is used by 64bit-resize interface to add a flex + * group each time. + */ +struct ext4_new_flex_group_data { + struct ext4_new_group_data *groups; /* new_group_data for groups + in the flex group */ + __u16 *bg_flags; /* block group flags of groups + in @groups */ + ext4_group_t count; /* number of groups in @groups + */ +}; + +/* + * alloc_flex_gd() allocates a ext4_new_flex_group_data with size of + * @flexbg_size. + * + * Returns NULL on failure otherwise address of the allocated structure. + */ +static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size) +{ + struct ext4_new_flex_group_data *flex_gd; + + flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS); + if (flex_gd == NULL) + goto out3; + + flex_gd->count = flexbg_size; + + flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) * + flexbg_size, GFP_NOFS); + if (flex_gd->groups == NULL) + goto out2; + + flex_gd->bg_flags = kmalloc(flexbg_size * sizeof(__u16), GFP_NOFS); + if (flex_gd->bg_flags == NULL) + goto out1; + + return flex_gd; + +out1: + kfree(flex_gd->groups); +out2: + kfree(flex_gd); +out3: + return NULL; +} + +static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd) +{ + kfree(flex_gd->bg_flags); + kfree(flex_gd->groups); + kfree(flex_gd); +} + +/* + * ext4_alloc_group_tables() allocates block bitmaps, inode bitmaps + * and inode tables for a flex group. + * + * This function is used by 64bit-resize. Note that this function allocates + * group tables from the 1st group of groups contained by @flexgd, which may + * be a partial of a flex group. + * + * @sb: super block of fs to which the groups belongs + */ +static void ext4_alloc_group_tables(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd, + int flexbg_size) +{ + struct ext4_new_group_data *group_data = flex_gd->groups; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + ext4_fsblk_t start_blk; + ext4_fsblk_t last_blk; + ext4_group_t src_group; + ext4_group_t bb_index = 0; + ext4_group_t ib_index = 0; + ext4_group_t it_index = 0; + ext4_group_t group; + ext4_group_t last_group; + unsigned overhead; + + BUG_ON(flex_gd->count == 0 || group_data == NULL); + + src_group = group_data[0].group; + last_group = src_group + flex_gd->count - 1; + + BUG_ON((flexbg_size > 1) && ((src_group & ~(flexbg_size - 1)) != + (last_group & ~(flexbg_size - 1)))); +next_group: + group = group_data[0].group; + start_blk = ext4_group_first_block_no(sb, src_group); + last_blk = start_blk + group_data[src_group - group].blocks_count; + + overhead = ext4_bg_has_super(sb, src_group) ? + (1 + ext4_bg_num_gdb(sb, src_group) + + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + + start_blk += overhead; + + BUG_ON(src_group >= group_data[0].group + flex_gd->count); + /* We collect contiguous blocks as much as possible. */ + src_group++; + for (; src_group <= last_group; src_group++) + if (!ext4_bg_has_super(sb, src_group)) + last_blk += group_data[src_group - group].blocks_count; + else + break; + + /* Allocate block bitmaps */ + for (; bb_index < flex_gd->count; bb_index++) { + if (start_blk >= last_blk) + goto next_group; + group_data[bb_index].block_bitmap = start_blk++; + ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); + group -= group_data[0].group; + group_data[group].free_blocks_count--; + if (flexbg_size > 1) + flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + } + + /* Allocate inode bitmaps */ + for (; ib_index < flex_gd->count; ib_index++) { + if (start_blk >= last_blk) + goto next_group; + group_data[ib_index].inode_bitmap = start_blk++; + ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); + group -= group_data[0].group; + group_data[group].free_blocks_count--; + if (flexbg_size > 1) + flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + } + + /* Allocate inode tables */ + for (; it_index < flex_gd->count; it_index++) { + if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) + goto next_group; + group_data[it_index].inode_table = start_blk; + ext4_get_group_no_and_offset(sb, start_blk, &group, NULL); + group -= group_data[0].group; + group_data[group].free_blocks_count -= + EXT4_SB(sb)->s_itb_per_group; + if (flexbg_size > 1) + flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + + start_blk += EXT4_SB(sb)->s_itb_per_group; + } + + if (test_opt(sb, DEBUG)) { + int i; + group = group_data[0].group; + + printk(KERN_DEBUG "EXT4-fs: adding a flex group with " + "%d groups, flexbg size is %d:\n", flex_gd->count, + flexbg_size); + + for (i = 0; i < flex_gd->count; i++) { + printk(KERN_DEBUG "adding %s group %u: %u " + "blocks (%d free)\n", + ext4_bg_has_super(sb, group + i) ? "normal" : + "no-super", group + i, + group_data[i].blocks_count, + group_data[i].free_blocks_count); + } + } +} + static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, ext4_fsblk_t blk) { @@ -179,131 +345,250 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh) } /* - * Set up the block and inode bitmaps, and the inode table for the new group. + * set_flexbg_block_bitmap() mark @count blocks starting from @block used. + * + * Helper function for ext4_setup_new_group_blocks() which set . + * + * @sb: super block + * @handle: journal handle + * @flex_gd: flex group data + */ +static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, + struct ext4_new_flex_group_data *flex_gd, + ext4_fsblk_t block, ext4_group_t count) +{ + ext4_group_t count2; + + ext4_debug("mark blocks [%llu/%u] used\n", block, count); + for (count2 = count; count > 0; count -= count2, block += count2) { + ext4_fsblk_t start; + struct buffer_head *bh; + ext4_group_t group; + int err; + + ext4_get_group_no_and_offset(sb, block, &group, NULL); + start = ext4_group_first_block_no(sb, group); + group -= flex_gd->groups[0].group; + + count2 = sb->s_blocksize * 8 - (block - start); + if (count2 > count) + count2 = count; + + if (flex_gd->bg_flags[group] & EXT4_BG_BLOCK_UNINIT) { + BUG_ON(flex_gd->count > 1); + continue; + } + + err = extend_or_restart_transaction(handle, 1); + if (err) + return err; + + bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap); + if (!bh) + return -EIO; + + err = ext4_journal_get_write_access(handle, bh); + if (err) + return err; + ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block, + block - start, count2); + ext4_set_bits(bh->b_data, block - start, count2); + + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) + return err; + brelse(bh); + } + + return 0; +} + +/* + * Set up the block and inode bitmaps, and the inode table for the new groups. * This doesn't need to be part of the main transaction, since we are only * changing blocks outside the actual filesystem. We still do journaling to * ensure the recovery is correct in case of a failure just after resize. * If any part of this fails, we simply abort the resize. + * + * setup_new_flex_group_blocks handles a flex group as follow: + * 1. copy super block and GDT, and initialize group tables if necessary. + * In this step, we only set bits in blocks bitmaps for blocks taken by + * super block and GDT. + * 2. allocate group tables in block bitmaps, that is, set bits in block + * bitmap for blocks taken by group tables. */ -static int setup_new_group_blocks(struct super_block *sb, - struct ext4_new_group_data *input) +static int setup_new_flex_group_blocks(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) { + int group_table_count[] = {1, 1, EXT4_SB(sb)->s_itb_per_group}; + ext4_fsblk_t start; + ext4_fsblk_t block; struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group); - int reserved_gdb = ext4_bg_has_super(sb, input->group) ? - le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0; - unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group); - struct buffer_head *bh; + struct ext4_super_block *es = sbi->s_es; + struct ext4_new_group_data *group_data = flex_gd->groups; + __u16 *bg_flags = flex_gd->bg_flags; handle_t *handle; - ext4_fsblk_t block; - ext4_grpblk_t bit; - int i; - int err = 0, err2; + ext4_group_t group, count; + struct buffer_head *bh = NULL; + int reserved_gdb, i, j, err = 0, err2; + + BUG_ON(!flex_gd->count || !group_data || + group_data[0].group != sbi->s_groups_count); + + reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); /* This transaction may be extended/restarted along the way */ handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); - if (IS_ERR(handle)) return PTR_ERR(handle); - BUG_ON(input->group != sbi->s_groups_count); + group = group_data[0].group; + for (i = 0; i < flex_gd->count; i++, group++) { + unsigned long gdblocks; - /* Copy all of the GDT blocks into the backup in this group */ - for (i = 0, bit = 1, block = start + 1; - i < gdblocks; i++, block++, bit++) { - struct buffer_head *gdb; + gdblocks = ext4_bg_num_gdb(sb, group); + start = ext4_group_first_block_no(sb, group); - ext4_debug("update backup group %#04llx (+%d)\n", block, bit); - err = extend_or_restart_transaction(handle, 1); - if (err) - goto exit_journal; + /* Copy all of the GDT blocks into the backup in this group */ + for (j = 0, block = start + 1; j < gdblocks; j++, block++) { + struct buffer_head *gdb; - gdb = sb_getblk(sb, block); - if (!gdb) { - err = -EIO; - goto exit_journal; - } - if ((err = ext4_journal_get_write_access(handle, gdb))) { + ext4_debug("update backup group %#04llx\n", block); + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; + + gdb = sb_getblk(sb, block); + if (!gdb) { + err = -EIO; + goto out; + } + + err = ext4_journal_get_write_access(handle, gdb); + if (err) { + brelse(gdb); + goto out; + } + memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data, + gdb->b_size); + set_buffer_uptodate(gdb); + + err = ext4_handle_dirty_metadata(handle, NULL, gdb); + if (unlikely(err)) { + brelse(gdb); + goto out; + } brelse(gdb); - goto exit_journal; } - memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); - set_buffer_uptodate(gdb); - err = ext4_handle_dirty_metadata(handle, NULL, gdb); - if (unlikely(err)) { - brelse(gdb); - goto exit_journal; + + /* Zero out all of the reserved backup group descriptor + * table blocks + */ + if (ext4_bg_has_super(sb, group)) { + err = sb_issue_zeroout(sb, gdblocks + start + 1, + reserved_gdb, GFP_NOFS); + if (err) + goto out; } - brelse(gdb); - } - /* Zero out all of the reserved backup group descriptor table blocks */ - ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", - block, sbi->s_itb_per_group); - err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, - GFP_NOFS); - if (err) - goto exit_journal; + /* Initialize group tables of the grop @group */ + if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) + goto handle_bb; - err = extend_or_restart_transaction(handle, 2); - if (err) - goto exit_journal; + /* Zero out all of the inode table blocks */ + block = group_data[i].inode_table; + ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", + block, sbi->s_itb_per_group); + err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, + GFP_NOFS); + if (err) + goto out; - bh = bclean(handle, sb, input->block_bitmap); - if (IS_ERR(bh)) { - err = PTR_ERR(bh); - goto exit_journal; - } +handle_bb: + if (bg_flags[i] & EXT4_BG_BLOCK_UNINIT) + goto handle_ib; - if (ext4_bg_has_super(sb, input->group)) { - ext4_debug("mark backup group tables %#04llx (+0)\n", start); - ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 1); - } + /* Initialize block bitmap of the @group */ + block = group_data[i].block_bitmap; + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; - ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, - input->block_bitmap - start); - ext4_set_bit(input->block_bitmap - start, bh->b_data); - ext4_debug("mark inode bitmap %#04llx (+%llu)\n", input->inode_bitmap, - input->inode_bitmap - start); - ext4_set_bit(input->inode_bitmap - start, bh->b_data); - - /* Zero out all of the inode table blocks */ - block = input->inode_table; - ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", - block, sbi->s_itb_per_group); - err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); - if (err) - goto exit_bh; - ext4_set_bits(bh->b_data, input->inode_table - start, - sbi->s_itb_per_group); + bh = bclean(handle, sb, block); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + goto out; + } + if (ext4_bg_has_super(sb, group)) { + ext4_debug("mark backup superblock %#04llx (+0)\n", + start); + ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + + 1); + } + ext4_mark_bitmap_end(group_data[i].blocks_count, + sb->s_blocksize * 8, bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out; + brelse(bh); +handle_ib: + if (bg_flags[i] & EXT4_BG_INODE_UNINIT) + continue; - ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, - bh->b_data); - err = ext4_handle_dirty_metadata(handle, NULL, bh); - if (unlikely(err)) { - ext4_std_error(sb, err); - goto exit_bh; + /* Initialize inode bitmap of the @group */ + block = group_data[i].inode_bitmap; + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; + /* Mark unused entries in inode bitmap used */ + bh = bclean(handle, sb, block); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + goto out; + } + + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), + sb->s_blocksize * 8, bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out; + brelse(bh); } - brelse(bh); - /* Mark unused entries in inode bitmap used */ - ext4_debug("clear inode bitmap %#04llx (+%llu)\n", - input->inode_bitmap, input->inode_bitmap - start); - if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { - err = PTR_ERR(bh); - goto exit_journal; + bh = NULL; + + /* Mark group tables in block bitmap */ + for (j = 0; j < GROUP_TABLE_COUNT; j++) { + count = group_table_count[j]; + start = (&group_data[0].block_bitmap)[j]; + block = start; + for (i = 1; i < flex_gd->count; i++) { + block += group_table_count[j]; + if (block == (&group_data[i].block_bitmap)[j]) { + count += group_table_count[j]; + continue; + } + err = set_flexbg_block_bitmap(sb, handle, + flex_gd, start, count); + if (err) + goto out; + count = group_table_count[j]; + start = group_data[i].block_bitmap; + block = start; + } + + if (count) { + err = set_flexbg_block_bitmap(sb, handle, + flex_gd, start, count); + if (err) + goto out; + } } - ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, - bh->b_data); - err = ext4_handle_dirty_metadata(handle, NULL, bh); - if (unlikely(err)) - ext4_std_error(sb, err); -exit_bh: +out: brelse(bh); - -exit_journal: - if ((err2 = ext4_journal_stop(handle)) && !err) + err2 = ext4_journal_stop(handle); + if (err2 && !err) err = err2; return err; @@ -351,10 +636,10 @@ static unsigned ext4_list_backups(struct super_block *sb, unsigned *three, * groups in current filesystem that have BACKUPS, or -ve error code. */ static int verify_reserved_gdb(struct super_block *sb, + ext4_group_t end, struct buffer_head *primary) { const ext4_fsblk_t blk = primary->b_blocknr; - const ext4_group_t end = EXT4_SB(sb)->s_groups_count; unsigned three = 1; unsigned five = 5; unsigned seven = 7; @@ -429,7 +714,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, if (!gdb_bh) return -EIO; - gdbackups = verify_reserved_gdb(sb, gdb_bh); + gdbackups = verify_reserved_gdb(sb, group, gdb_bh); if (gdbackups < 0) { err = gdbackups; goto exit_bh; @@ -592,7 +877,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, err = -EIO; goto exit_bh; } - if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) { + gdbackups = verify_reserved_gdb(sb, group, primary[res]); + if (gdbackups < 0) { brelse(primary[res]); err = gdbackups; goto exit_bh; @@ -735,6 +1021,348 @@ exit_err: } } +/* + * ext4_add_new_descs() adds @count group descriptor of groups + * starting at @group + * + * @handle: journal handle + * @sb: super block + * @group: the group no. of the first group desc to be added + * @resize_inode: the resize inode + * @count: number of group descriptors to be added + */ +static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, + ext4_group_t group, struct inode *resize_inode, + ext4_group_t count) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct buffer_head *gdb_bh; + int i, gdb_off, gdb_num, err = 0; + + for (i = 0; i < count; i++, group++) { + int reserved_gdb = ext4_bg_has_super(sb, group) ? + le16_to_cpu(es->s_reserved_gdt_blocks) : 0; + + gdb_off = group % EXT4_DESC_PER_BLOCK(sb); + gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + + /* + * We will only either add reserved group blocks to a backup group + * or remove reserved blocks for the first group in a new group block. + * Doing both would be mean more complex code, and sane people don't + * use non-sparse filesystems anymore. This is already checked above. + */ + if (gdb_off) { + gdb_bh = sbi->s_group_desc[gdb_num]; + err = ext4_journal_get_write_access(handle, gdb_bh); + + if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group)) + err = reserve_backup_gdb(handle, resize_inode, group); + } else + err = add_new_gdb(handle, resize_inode, group); + if (err) + break; + } + return err; +} + +/* + * ext4_setup_new_descs() will set up the group descriptor descriptors of a flex bg + */ +static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) +{ + struct ext4_new_group_data *group_data = flex_gd->groups; + struct ext4_group_desc *gdp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *gdb_bh; + ext4_group_t group; + __u16 *bg_flags = flex_gd->bg_flags; + int i, gdb_off, gdb_num, err = 0; + + + for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) { + group = group_data->group; + + gdb_off = group % EXT4_DESC_PER_BLOCK(sb); + gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + + /* + * get_write_access() has been called on gdb_bh by ext4_add_new_desc(). + */ + gdb_bh = sbi->s_group_desc[gdb_num]; + /* Update group descriptor block for new group */ + gdp = (struct ext4_group_desc *)((char *)gdb_bh->b_data + + gdb_off * EXT4_DESC_SIZE(sb)); + + memset(gdp, 0, EXT4_DESC_SIZE(sb)); + ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap); + ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap); + ext4_inode_table_set(sb, gdp, group_data->inode_table); + ext4_free_group_clusters_set(sb, gdp, + EXT4_B2C(sbi, group_data->free_blocks_count)); + ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); + gdp->bg_flags = cpu_to_le16(*bg_flags); + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + + err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); + if (unlikely(err)) { + ext4_std_error(sb, err); + break; + } + + /* + * We can allocate memory for mb_alloc based on the new group + * descriptor + */ + err = ext4_mb_add_groupinfo(sb, group, gdp); + if (err) + break; + } + return err; +} + +/* + * ext4_update_super() updates the super block so that the newly added + * groups can be seen by the filesystem. + * + * @sb: super block + * @flex_gd: new added groups + */ +static void ext4_update_super(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) +{ + ext4_fsblk_t blocks_count = 0; + ext4_fsblk_t free_blocks = 0; + ext4_fsblk_t reserved_blocks = 0; + struct ext4_new_group_data *group_data = flex_gd->groups; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int i; + + BUG_ON(flex_gd->count == 0 || group_data == NULL); + /* + * Make the new blocks and inodes valid next. We do this before + * increasing the group count so that once the group is enabled, + * all of its blocks and inodes are already valid. + * + * We always allocate group-by-group, then block-by-block or + * inode-by-inode within a group, so enabling these + * blocks/inodes before the group is live won't actually let us + * allocate the new space yet. + */ + for (i = 0; i < flex_gd->count; i++) { + blocks_count += group_data[i].blocks_count; + free_blocks += group_data[i].free_blocks_count; + } + + reserved_blocks = ext4_r_blocks_count(es) * 100; + do_div(reserved_blocks, ext4_blocks_count(es)); + reserved_blocks *= blocks_count; + do_div(reserved_blocks, 100); + + ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); + le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * + flex_gd->count); + + /* + * We need to protect s_groups_count against other CPUs seeing + * inconsistent state in the superblock. + * + * The precise rules we use are: + * + * * Writers must perform a smp_wmb() after updating all + * dependent data and before modifying the groups count + * + * * Readers must perform an smp_rmb() after reading the groups + * count and before reading any dependent data. + * + * NB. These rules can be relaxed when checking the group count + * while freeing data, as we can only allocate from a block + * group after serialising against the group count, and we can + * only then free after serialising in turn against that + * allocation. + */ + smp_wmb(); + + /* Update the global fs size fields */ + sbi->s_groups_count += flex_gd->count; + + /* Update the reserved block counts only once the new group is + * active. */ + ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + + reserved_blocks); + + /* Update the free space counts */ + percpu_counter_add(&sbi->s_freeclusters_counter, + EXT4_B2C(sbi, free_blocks)); + percpu_counter_add(&sbi->s_freeinodes_counter, + EXT4_INODES_PER_GROUP(sb) * flex_gd->count); + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_FLEX_BG) && + sbi->s_log_groups_per_flex) { + ext4_group_t flex_group; + flex_group = ext4_flex_group(sbi, group_data[0].group); + atomic_add(EXT4_B2C(sbi, free_blocks), + &sbi->s_flex_groups[flex_group].free_clusters); + atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, + &sbi->s_flex_groups[flex_group].free_inodes); + } + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: added group %u:" + "%llu blocks(%llu free %llu reserved)\n", flex_gd->count, + blocks_count, free_blocks, reserved_blocks); +} + +/* Add a flex group to an fs. Ensure we handle all possible error conditions + * _before_ we start modifying the filesystem, because we cannot abort the + * transaction and not have it write the data to disk. + */ +static int ext4_flex_group_add(struct super_block *sb, + struct inode *resize_inode, + struct ext4_new_flex_group_data *flex_gd) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_fsblk_t o_blocks_count; + ext4_grpblk_t last; + ext4_group_t group; + handle_t *handle; + unsigned reserved_gdb; + int err = 0, err2 = 0, credit; + + BUG_ON(!flex_gd->count || !flex_gd->groups || !flex_gd->bg_flags); + + reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); + o_blocks_count = ext4_blocks_count(es); + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + BUG_ON(last); + + err = setup_new_flex_group_blocks(sb, flex_gd); + if (err) + goto exit; + /* + * We will always be modifying at least the superblock and GDT + * block. If we are adding a group past the last current GDT block, + * we will also modify the inode and the dindirect block. If we + * are adding a group with superblock/GDT backups we will also + * modify each of the reserved GDT dindirect blocks. + */ + credit = flex_gd->count * 4 + reserved_gdb; + handle = ext4_journal_start_sb(sb, credit); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto exit; + } + + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto exit_journal; + + group = flex_gd->groups[0].group; + BUG_ON(group != EXT4_SB(sb)->s_groups_count); + err = ext4_add_new_descs(handle, sb, group, + resize_inode, flex_gd->count); + if (err) + goto exit_journal; + + err = ext4_setup_new_descs(handle, sb, flex_gd); + if (err) + goto exit_journal; + + ext4_update_super(sb, flex_gd); + + err = ext4_handle_dirty_super(handle, sb); + +exit_journal: + err2 = ext4_journal_stop(handle); + if (!err) + err = err2; + + if (!err) { + int i; + update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext4_super_block)); + for (i = 0; i < flex_gd->count; i++, group++) { + struct buffer_head *gdb_bh; + int gdb_num; + gdb_num = group / EXT4_BLOCKS_PER_GROUP(sb); + gdb_bh = sbi->s_group_desc[gdb_num]; + update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data, + gdb_bh->b_size); + } + } +exit: + return err; +} + +static int ext4_setup_next_flex_gd(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd, + ext4_fsblk_t n_blocks_count, + unsigned long flexbg_size) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct ext4_new_group_data *group_data = flex_gd->groups; + ext4_fsblk_t o_blocks_count; + ext4_group_t n_group; + ext4_group_t group; + ext4_group_t last_group; + ext4_grpblk_t last; + ext4_grpblk_t blocks_per_group; + unsigned long i; + + blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb); + + o_blocks_count = ext4_blocks_count(es); + + if (o_blocks_count == n_blocks_count) + return 0; + + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + BUG_ON(last); + ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last); + + last_group = group | (flexbg_size - 1); + if (last_group > n_group) + last_group = n_group; + + flex_gd->count = last_group - group + 1; + + for (i = 0; i < flex_gd->count; i++) { + int overhead; + + group_data[i].group = group + i; + group_data[i].blocks_count = blocks_per_group; + overhead = ext4_bg_has_super(sb, group + i) ? + (1 + ext4_bg_num_gdb(sb, group + i) + + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + group_data[i].free_blocks_count = blocks_per_group - overhead; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | + EXT4_BG_INODE_UNINIT; + else + flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED; + } + + if (last_group == n_group && + EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + /* We need to initialize block bitmap of last group. */ + flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT; + + if ((last_group == n_group) && (last != blocks_per_group - 1)) { + group_data[i - 1].blocks_count = last + 1; + group_data[i - 1].free_blocks_count -= blocks_per_group- + last - 1; + } + + return 1; +} + /* Add group descriptor data to an existing or new group descriptor block. * Ensure we handle all possible error conditions _before_ we start modifying * the filesystem, because we cannot abort the transaction and not have it @@ -750,16 +1378,15 @@ exit_err: */ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) { + struct ext4_new_flex_group_data flex_gd; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int reserved_gdb = ext4_bg_has_super(sb, input->group) ? le16_to_cpu(es->s_reserved_gdt_blocks) : 0; - struct buffer_head *primary = NULL; - struct ext4_group_desc *gdp; struct inode *inode = NULL; - handle_t *handle; int gdb_off, gdb_num; - int err, err2; + int err; + __u16 bg_flags = 0; gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); @@ -798,175 +1425,69 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) } - if ((err = verify_group_input(sb, input))) - goto exit_put; + err = verify_group_input(sb, input); + if (err) + goto out; - if ((err = setup_new_group_blocks(sb, input))) - goto exit_put; + flex_gd.count = 1; + flex_gd.groups = input; + flex_gd.bg_flags = &bg_flags; + err = ext4_flex_group_add(sb, inode, &flex_gd); +out: + iput(inode); + return err; +} /* ext4_group_add */ - /* - * We will always be modifying at least the superblock and a GDT - * block. If we are adding a group past the last current GDT block, - * we will also modify the inode and the dindirect block. If we - * are adding a group with superblock/GDT backups we will also - * modify each of the reserved GDT dindirect blocks. +/* + * extend a group without checking assuming that checking has been done. + */ +static int ext4_group_extend_no_check(struct super_block *sb, + ext4_fsblk_t o_blocks_count, ext4_grpblk_t add) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + handle_t *handle; + int err = 0, err2; + + /* We will update the superblock, one block bitmap, and + * one group descriptor via ext4_group_add_blocks(). */ - handle = ext4_journal_start_sb(sb, - ext4_bg_has_super(sb, input->group) ? - 3 + reserved_gdb : 4); + handle = ext4_journal_start_sb(sb, 3); if (IS_ERR(handle)) { err = PTR_ERR(handle); - goto exit_put; + ext4_warning(sb, "error %d on journal start", err); + return err; } - if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) - goto exit_journal; - - /* - * We will only either add reserved group blocks to a backup group - * or remove reserved blocks for the first group in a new group block. - * Doing both would be mean more complex code, and sane people don't - * use non-sparse filesystems anymore. This is already checked above. - */ - if (gdb_off) { - primary = sbi->s_group_desc[gdb_num]; - if ((err = ext4_journal_get_write_access(handle, primary))) - goto exit_journal; - - if (reserved_gdb && ext4_bg_num_gdb(sb, input->group)) { - err = reserve_backup_gdb(handle, inode, input->group); - if (err) - goto exit_journal; - } - } else { - /* - * Note that we can access new group descriptor block safely - * only if add_new_gdb() succeeds. - */ - err = add_new_gdb(handle, inode, input->group); - if (err) - goto exit_journal; - primary = sbi->s_group_desc[gdb_num]; + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (err) { + ext4_warning(sb, "error %d on journal write access", err); + goto errout; } - /* - * OK, now we've set up the new group. Time to make it active. - * - * so we have to be safe wrt. concurrent accesses the group - * data. So we need to be careful to set all of the relevant - * group descriptor data etc. *before* we enable the group. - * - * The key field here is sbi->s_groups_count: as long as - * that retains its old value, nobody is going to access the new - * group. - * - * So first we update all the descriptor metadata for the new - * group; then we update the total disk blocks count; then we - * update the groups count to enable the group; then finally we - * update the free space counts so that the system can start - * using the new disk blocks. - */ - - /* Update group descriptor block for new group */ - gdp = (struct ext4_group_desc *)((char *)primary->b_data + - gdb_off * EXT4_DESC_SIZE(sb)); - - memset(gdp, 0, EXT4_DESC_SIZE(sb)); - ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ - ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ - ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ - ext4_free_group_clusters_set(sb, gdp, input->free_blocks_count); - ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); - gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED); - gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); - - /* - * We can allocate memory for mb_alloc based on the new group - * descriptor - */ - err = ext4_mb_add_groupinfo(sb, input->group, gdp); + ext4_blocks_count_set(es, o_blocks_count + add); + ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); + /* We add the blocks to the bitmap and set the group need init bit */ + err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); if (err) - goto exit_journal; - - /* - * Make the new blocks and inodes valid next. We do this before - * increasing the group count so that once the group is enabled, - * all of its blocks and inodes are already valid. - * - * We always allocate group-by-group, then block-by-block or - * inode-by-inode within a group, so enabling these - * blocks/inodes before the group is live won't actually let us - * allocate the new space yet. - */ - ext4_blocks_count_set(es, ext4_blocks_count(es) + - input->blocks_count); - le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb)); - - /* - * We need to protect s_groups_count against other CPUs seeing - * inconsistent state in the superblock. - * - * The precise rules we use are: - * - * * Writers must perform a smp_wmb() after updating all dependent - * data and before modifying the groups count - * - * * Readers must perform an smp_rmb() after reading the groups count - * and before reading any dependent data. - * - * NB. These rules can be relaxed when checking the group count - * while freeing data, as we can only allocate from a block - * group after serialising against the group count, and we can - * only then free after serialising in turn against that - * allocation. - */ - smp_wmb(); - - /* Update the global fs size fields */ - sbi->s_groups_count++; - - err = ext4_handle_dirty_metadata(handle, NULL, primary); - if (unlikely(err)) { - ext4_std_error(sb, err); - goto exit_journal; - } - - /* Update the reserved block counts only once the new group is - * active. */ - ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + - input->reserved_blocks); - - /* Update the free space counts */ - percpu_counter_add(&sbi->s_freeclusters_counter, - EXT4_B2C(sbi, input->free_blocks_count)); - percpu_counter_add(&sbi->s_freeinodes_counter, - EXT4_INODES_PER_GROUP(sb)); - - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && - sbi->s_log_groups_per_flex) { - ext4_group_t flex_group; - flex_group = ext4_flex_group(sbi, input->group); - atomic_add(EXT4_B2C(sbi, input->free_blocks_count), - &sbi->s_flex_groups[flex_group].free_clusters); - atomic_add(EXT4_INODES_PER_GROUP(sb), - &sbi->s_flex_groups[flex_group].free_inodes); - } - + goto errout; ext4_handle_dirty_super(handle, sb); - -exit_journal: - if ((err2 = ext4_journal_stop(handle)) && !err) + ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); +errout: + err2 = ext4_journal_stop(handle); + if (err2 && !err) err = err2; - if (!err && primary) { - update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, + + if (!err) { + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: extended group to %llu " + "blocks\n", ext4_blocks_count(es)); + update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, sizeof(struct ext4_super_block)); - update_backups(sb, primary->b_blocknr, primary->b_data, - primary->b_size); } -exit_put: - iput(inode); return err; -} /* ext4_group_add */ +} /* * Extend the filesystem to the new number of blocks specified. This entry @@ -985,8 +1506,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_grpblk_t last; ext4_grpblk_t add; struct buffer_head *bh; - handle_t *handle; - int err, err2; + int err; ext4_group_t group; o_blocks_count = ext4_blocks_count(es); @@ -1042,42 +1562,119 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, } brelse(bh); - /* We will update the superblock, one block bitmap, and - * one group descriptor via ext4_free_blocks(). - */ - handle = ext4_journal_start_sb(sb, 3); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - ext4_warning(sb, "error %d on journal start", err); - goto exit_put; + err = ext4_group_extend_no_check(sb, o_blocks_count, add); + return err; +} /* ext4_group_extend */ + +/* + * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count + * + * @sb: super block of the fs to be resized + * @n_blocks_count: the number of blocks resides in the resized fs + */ +int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) +{ + struct ext4_new_flex_group_data *flex_gd = NULL; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct buffer_head *bh; + struct inode *resize_inode; + ext4_fsblk_t o_blocks_count; + ext4_group_t o_group; + ext4_group_t n_group; + ext4_grpblk_t offset; + unsigned long n_desc_blocks; + unsigned long o_desc_blocks; + unsigned long desc_blocks; + int err = 0, flexbg_size = 1; + + o_blocks_count = ext4_blocks_count(es); + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu " + "upto %llu blocks\n", o_blocks_count, n_blocks_count); + + if (n_blocks_count < o_blocks_count) { + /* On-line shrinking not supported */ + ext4_warning(sb, "can't shrink FS - resize aborted"); + return -EINVAL; } - if ((err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh))) { - ext4_warning(sb, "error %d on journal write access", err); - ext4_journal_stop(handle); - goto exit_put; + if (n_blocks_count == o_blocks_count) + /* Nothing need to do */ + return 0; + + ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); + ext4_get_group_no_and_offset(sb, o_blocks_count, &o_group, &offset); + + n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) / + EXT4_DESC_PER_BLOCK(sb); + o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / + EXT4_DESC_PER_BLOCK(sb); + desc_blocks = n_desc_blocks - o_desc_blocks; + + if (desc_blocks && + (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) || + le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) { + ext4_warning(sb, "No reserved GDT blocks, can't resize"); + return -EPERM; } - ext4_blocks_count_set(es, o_blocks_count + add); - ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, - o_blocks_count + add); - /* We add the blocks to the bitmap and set the group need init bit */ - err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); - ext4_handle_dirty_super(handle, sb); - ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, - o_blocks_count + add); - err2 = ext4_journal_stop(handle); - if (!err && err2) - err = err2; - if (err) - goto exit_put; + resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); + if (IS_ERR(resize_inode)) { + ext4_warning(sb, "Error opening resize inode"); + return PTR_ERR(resize_inode); + } + /* See if the device is actually as big as what was requested */ + bh = sb_bread(sb, n_blocks_count - 1); + if (!bh) { + ext4_warning(sb, "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + + if (offset != 0) { + /* extend the last group */ + ext4_grpblk_t add; + add = EXT4_BLOCKS_PER_GROUP(sb) - offset; + err = ext4_group_extend_no_check(sb, o_blocks_count, add); + if (err) + goto out; + } + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && + es->s_log_groups_per_flex) + flexbg_size = 1 << es->s_log_groups_per_flex; + + o_blocks_count = ext4_blocks_count(es); + if (o_blocks_count == n_blocks_count) + goto out; + + flex_gd = alloc_flex_gd(flexbg_size); + if (flex_gd == NULL) { + err = -ENOMEM; + goto out; + } + + /* Add flex groups. Note that a regular group is a + * flex group with 1 group. + */ + while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count, + flexbg_size)) { + ext4_alloc_group_tables(sb, flex_gd, flexbg_size); + err = ext4_flex_group_add(sb, resize_inode, flex_gd); + if (unlikely(err)) + break; + } + +out: + if (flex_gd) + free_flex_gd(flex_gd); + + iput(resize_inode); if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", - ext4_blocks_count(es)); - update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext4_super_block)); -exit_put: + printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu " + "upto %llu blocks\n", o_blocks_count, n_blocks_count); return err; -} /* ext4_group_extend */ +} diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ed3ce82e2de4..502c61fd7392 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1095,7 +1095,7 @@ static int ext4_show_options(struct seq_file *seq, struct dentry *root) } if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) { seq_printf(seq, ",max_batch_time=%u", - (unsigned) sbi->s_min_batch_time); + (unsigned) sbi->s_max_batch_time); } /* @@ -2005,17 +2005,16 @@ static int ext4_fill_flex_info(struct super_block *sb) struct ext4_group_desc *gdp = NULL; ext4_group_t flex_group_count; ext4_group_t flex_group; - int groups_per_flex = 0; + unsigned int groups_per_flex = 0; size_t size; int i; sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; - groups_per_flex = 1 << sbi->s_log_groups_per_flex; - - if (groups_per_flex < 2) { + if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { sbi->s_log_groups_per_flex = 0; return 1; } + groups_per_flex = 1 << sbi->s_log_groups_per_flex; /* We allocate both existing and potentially added groups */ flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + @@ -3506,7 +3505,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * of the filesystem. */ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { - ext4_msg(sb, KERN_WARNING, "bad geometry: first data" + ext4_msg(sb, KERN_WARNING, "bad geometry: first data " "block %u is beyond end of filesystem (%llu)", le32_to_cpu(es->s_first_data_block), ext4_blocks_count(es)); diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index b60f9f81e33c..d2a200624af5 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -47,8 +47,9 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } -int ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *fs_info) +static int +ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { const struct xattr *xattr; handle_t *handle = fs_info; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 68d704db787f..5069b8475150 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -430,6 +430,12 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(3, "JBD2: commit phase 1\n"); /* + * Clear revoked flag to reflect there is no revoked buffers + * in the next transaction which is going to be started. + */ + jbd2_clear_buffer_revoked_flags(journal); + + /* * Switch to a new revoke table. */ jbd2_journal_switch_revoke_table(journal); diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 69fd93588118..30b2867d6cc9 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -47,6 +47,10 @@ * overwriting the new data. We don't even need to clear the revoke * bit here. * + * We cache revoke status of a buffer in the current transaction in b_states + * bits. As the name says, revokevalid flag indicates that the cached revoke + * status of a buffer is valid and we can rely on the cached status. + * * Revoke information on buffers is a tri-state value: * * RevokeValid clear: no cached revoke status, need to look it up @@ -478,6 +482,36 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) return did_revoke; } +/* + * journal_clear_revoked_flag clears revoked flag of buffers in + * revoke table to reflect there is no revoked buffers in the next + * transaction which is going to be started. + */ +void jbd2_clear_buffer_revoked_flags(journal_t *journal) +{ + struct jbd2_revoke_table_s *revoke = journal->j_revoke; + int i = 0; + + for (i = 0; i < revoke->hash_size; i++) { + struct list_head *hash_list; + struct list_head *list_entry; + hash_list = &revoke->hash_table[i]; + + list_for_each(list_entry, hash_list) { + struct jbd2_revoke_record_s *record; + struct buffer_head *bh; + record = (struct jbd2_revoke_record_s *)list_entry; + bh = __find_get_block(journal->j_fs_dev, + record->blocknr, + journal->j_blocksize); + if (bh) { + clear_buffer_revoked(bh); + __brelse(bh); + } + } + } +} + /* journal_switch_revoke table select j_revoke for next transaction * we do not want to suspend any processing until all revokes are * written -bzzz diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index a0e41a4c080e..35ae096bed5d 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -517,12 +517,13 @@ void jbd2_journal_lock_updates(journal_t *journal) break; spin_lock(&transaction->t_handle_lock); + prepare_to_wait(&journal->j_wait_updates, &wait, + TASK_UNINTERRUPTIBLE); if (!atomic_read(&transaction->t_updates)) { spin_unlock(&transaction->t_handle_lock); + finish_wait(&journal->j_wait_updates, &wait); break; } - prepare_to_wait(&journal->j_wait_updates, &wait, - TASK_UNINTERRUPTIBLE); spin_unlock(&transaction->t_handle_lock); write_unlock(&journal->j_state_lock); schedule(); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 2092ea21e469..5557baefed60 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1151,6 +1151,7 @@ extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t); extern int jbd2_journal_test_revoke(journal_t *, unsigned long long, tid_t); extern void jbd2_journal_clear_revoke(journal_t *); extern void jbd2_journal_switch_revoke_table(journal_t *journal); +extern void jbd2_clear_buffer_revoked_flags(journal_t *journal); /* * The log thread user interface: diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 748ff7cbe555..319538bf17d2 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -573,9 +573,9 @@ TRACE_EVENT(ext4_mb_release_inode_pa, ); TRACE_EVENT(ext4_mb_release_group_pa, - TP_PROTO(struct ext4_prealloc_space *pa), + TP_PROTO(struct super_block *sb, struct ext4_prealloc_space *pa), - TP_ARGS(pa), + TP_ARGS(sb, pa), TP_STRUCT__entry( __field( dev_t, dev ) @@ -585,7 +585,7 @@ TRACE_EVENT(ext4_mb_release_group_pa, ), TP_fast_assign( - __entry->dev = pa->pa_inode->i_sb->s_dev; + __entry->dev = sb->s_dev; __entry->pa_pstart = pa->pa_pstart; __entry->pa_len = pa->pa_len; ), |