From 5e1f8c9e20a92743eefc9a82c2db835213905e26 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 28 Oct 2008 13:21:55 -0400 Subject: ext3: Add support for non-native signed/unsigned htree hash algorithms The original ext3 hash algorithms assumed that variables of type char were signed, as God and K&R intended. Unfortunately, this assumption is not true on some architectures. Userspace support for marking filesystems with non-native signed/unsigned chars was added two years ago, but the kernel-side support was never added (until now). Signed-off-by: "Theodore Ts'o" Cc: akpm@linux-foundation.org Cc: linux-kernel@vger.kernel.org --- fs/ext3/hash.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++-------- fs/ext3/namei.c | 7 ++++++ fs/ext3/super.c | 12 +++++++++ 3 files changed, 86 insertions(+), 10 deletions(-) (limited to 'fs/ext3') diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c index c30e149fbd2e..7d215b4d4f2e 100644 --- a/fs/ext3/hash.c +++ b/fs/ext3/hash.c @@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[]) /* The old legacy hash */ -static __u32 dx_hack_hash (const char *name, int len) +static __u32 dx_hack_hash_unsigned(const char *name, int len) { - __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const unsigned char *ucp = (const unsigned char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static __u32 dx_hack_hash_signed(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const signed char *scp = (const signed char *) name; + while (len--) { - __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); - if (hash & 0x80000000) hash -= 0x7fffffff; + if (hash & 0x80000000) + hash -= 0x7fffffff; hash1 = hash0; hash0 = hash; } - return (hash0 << 1); + return hash0 << 1; } -static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) +static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) { __u32 pad, val; int i; + const signed char *scp = (const signed char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) scp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const unsigned char *ucp = (const unsigned char *) msg; pad = (__u32)len | ((__u32)len << 8); pad |= pad << 16; @@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) for (i=0; i < len; i++) { if ((i % 4) == 0) val = pad; - val = msg[i] + (val << 8); + val = ((int) ucp[i]) + (val << 8); if ((i % 4) == 3) { *buf++ = val; val = pad; @@ -95,6 +143,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) const char *p; int i; __u32 in[8], buf[4]; + void (*str2hashbuf)(const char *, int, __u32 *, int) = + str2hashbuf_signed; /* Initialize the default seed for the hash checksum functions */ buf[0] = 0x67452301; @@ -113,13 +163,18 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) } switch (hinfo->hash_version) { + case DX_HASH_LEGACY_UNSIGNED: + hash = dx_hack_hash_unsigned(name, len); + break; case DX_HASH_LEGACY: - hash = dx_hack_hash(name, len); + hash = dx_hack_hash_signed(name, len); break; + case DX_HASH_HALF_MD4_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; case DX_HASH_HALF_MD4: p = name; while (len > 0) { - str2hashbuf(p, len, in, 8); + (*str2hashbuf)(p, len, in, 8); half_md4_transform(buf, in); len -= 32; p += 32; @@ -127,10 +182,12 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) minor_hash = buf[2]; hash = buf[1]; break; + case DX_HASH_TEA_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; case DX_HASH_TEA: p = name; while (len > 0) { - str2hashbuf(p, len, in, 4); + (*str2hashbuf)(p, len, in, 4); TEA_transform(buf, in); len -= 16; p += 16; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 1dd2abe6313e..287b304d42a7 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -368,6 +368,8 @@ dx_probe(struct qstr *entry, struct inode *dir, goto fail; } hinfo->hash_version = root->info.hash_version; + if (hinfo->hash_version <= DX_HASH_TEA) + hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; if (entry) ext3fs_dirhash(entry->name, entry->len, hinfo); @@ -636,6 +638,9 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, dir = dir_file->f_path.dentry->d_inode; if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += + EXT3_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash); @@ -1398,6 +1403,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, /* Initialize as for dx_probe */ hinfo.hash_version = root->info.hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; ext3fs_dirhash(name, namelen, &hinfo); frame = frames; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index f6c94f232ec1..541d5e4f7f6e 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1744,6 +1744,18 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) for (i=0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { +#ifdef __CHAR_UNSIGNED__ + es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; +#else + es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); +#endif + sb->s_dirt = 1; + } if (sbi->s_blocks_per_group > blocksize * 8) { printk (KERN_ERR -- cgit v1.2.3 From 59e315b4c410b00a9acd0f24a00dbadbe81ce692 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 6 Dec 2008 16:58:39 -0500 Subject: ext3/4: Fix loop index in do_split() so it is signed This fixes a gcc warning but it doesn't appear able to result in a failure, since the primary way the loop is exited is the first conditional in the for loop, and at least for a consistent filesystem, the signed/unsigned should in practice never be exposed. Signed-off-by: Roel Kluin Signed-off-by: "Theodore Ts'o" --- fs/ext3/namei.c | 4 ++-- fs/ext4/namei.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/ext3') diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 287b304d42a7..2c2d700c1ccf 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1161,9 +1161,9 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, u32 hash2; struct dx_map_entry *map; char *data1 = (*bh)->b_data, *data2; - unsigned split, move, size, i; + unsigned split, move, size; struct ext3_dir_entry_2 *de = NULL, *de2; - int err = 0; + int err = 0, i; bh2 = ext3_append (handle, dir, &newblock, &err); if (!(bh2)) { diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 315858db8078..84a68ae623c1 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1171,9 +1171,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, u32 hash2; struct dx_map_entry *map; char *data1 = (*bh)->b_data, *data2; - unsigned split, move, size, i; + unsigned split, move, size; struct ext4_dir_entry_2 *de = NULL, *de2; - int err = 0; + int err = 0, i; bh2 = ext4_append (handle, dir, &newblock, &err); if (!(bh2)) { -- cgit v1.2.3 From 6b082b531228c43d454c082fc0f969da1695b060 Mon Sep 17 00:00:00 2001 From: Toshiyuki Okajima Date: Mon, 5 Jan 2009 22:38:14 -0500 Subject: ext3: provide function to release metadata pages under memory pressure Pages in the page cache belonging to ext3 data files are released via the ext3_releasepage() function specified in the ext3 inode's address_space_ops. However, metadata blocks (such as indirect blocks, directory blocks, etc) are managed via the block device address_space_ops, and they can not be released by try_to_free_buffers() if they have a journal head attached to them. To address this, we supply a try_to_free_pages() function which calls journal_try_to_free_buffers() function to free the metadata, and which is called by the block device's blkdev_releasepage() function. Signed-off-by: Toshiyuki Okajima Signed-off-by: "Theodore Ts'o" Cc: linux-fsdevel@vger.kernel.org --- fs/ext3/super.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'fs/ext3') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 541d5e4f7f6e..6900ff05e3ab 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -682,6 +682,26 @@ static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid, ext3_nfs_get_inode); } +/* + * Try to release metadata pages (indirect blocks, directories) which are + * mapped via the block device. Since these pages could have journal heads + * which would prevent try_to_free_buffers() from freeing them, we must use + * jbd layer's try_to_free_buffers() function to release them. + */ +static int bdev_try_to_free_page(struct super_block *sb, struct page *page, + gfp_t wait) +{ + journal_t *journal = EXT3_SB(sb)->s_journal; + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + if (journal) + return journal_try_to_free_buffers(journal, page, + wait & ~__GFP_WAIT); + return try_to_free_buffers(page); +} + #ifdef CONFIG_QUOTA #define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") #define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) @@ -746,6 +766,7 @@ static const struct super_operations ext3_sops = { .quota_read = ext3_quota_read, .quota_write = ext3_quota_write, #endif + .bdev_try_to_free_page = bdev_try_to_free_page, }; static const struct export_operations ext3_export_ops = { -- cgit v1.2.3 From 5df096d67ec2b6578518caed7d57317a4b807aa1 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 7 Jan 2009 18:07:25 -0800 Subject: ext3: allocate ->s_blockgroup_lock separately As spotted by kmemtrace, struct ext3_sb_info is 17152 bytes on 64-bit which makes it a very bad fit for SLAB allocators. The culprit of the wasted memory is ->s_blockgroup_lock which can be as big as 16 KB when NR_CPUS >= 32. To fix that, allocate ->s_blockgroup_lock, which fits nicely in a order 2 page in the worst case, separately. This shinks down struct ext3_sb_info enough to fit a 1 KB slab cache so now we allocate 16 KB + 1 KB instead of 32 KB saving 15 KB of memory. Acked-by: Andreas Dilger Signed-off-by: Pekka Enberg Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/super.c | 10 +++++++++- include/linux/ext3_fs_sb.h | 4 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'fs/ext3') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index c22d01467bd1..01c235bc2054 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -439,6 +439,7 @@ static void ext3_put_super (struct super_block * sb) ext3_blkdev_remove(sbi); } sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); return; } @@ -1546,6 +1547,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) { + kfree(sbi); + return -ENOMEM; + } sb->s_fs_info = sbi; sbi->s_mount_opt = 0; sbi->s_resuid = EXT3_DEF_RESUID; @@ -1786,7 +1794,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; } - bgl_lock_init(&sbi->s_blockgroup_lock); + bgl_lock_init(sbi->s_blockgroup_lock); for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logic_sb_block, i); diff --git a/include/linux/ext3_fs_sb.h b/include/linux/ext3_fs_sb.h index e024e38248ff..76fdc0f4b028 100644 --- a/include/linux/ext3_fs_sb.h +++ b/include/linux/ext3_fs_sb.h @@ -60,7 +60,7 @@ struct ext3_sb_info { struct percpu_counter s_freeblocks_counter; struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; - struct blockgroup_lock s_blockgroup_lock; + struct blockgroup_lock *s_blockgroup_lock; /* root of the per fs reservation window tree */ spinlock_t s_rsv_window_lock; @@ -86,7 +86,7 @@ struct ext3_sb_info { static inline spinlock_t * sb_bgl_lock(struct ext3_sb_info *sbi, unsigned int block_group) { - return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group); + return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); } #endif /* _LINUX_EXT3_FS_SB */ -- cgit v1.2.3 From 2e8671cb566da993425d324fc355af31edc6e7f1 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Wed, 7 Jan 2009 18:07:26 -0800 Subject: ext3: don't inherit inappropriate inode flags from parent At present INDEX is the only flag that new ext3 inodes do NOT inherit from their parent. In addition prevent the flags DIRTY, ECOMPR, IMAGIC and TOPDIR from being inherited. List inheritable flags explicitly to prevent future flags from accidentally being inherited. This fixes the TOPDIR flag inheritance bug reported at http://bugzilla.kernel.org/show_bug.cgi?id=9866. Signed-off-by: Duane Griffin Acked-by: Andreas Dilger Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/ialloc.c | 2 +- include/linux/ext3_fs.h | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/ext3') diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 5655fbcbd11f..ba9186a21d1e 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -559,7 +559,7 @@ got: ei->i_dir_start_lookup = 0; ei->i_disksize = 0; - ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; + ei->i_flags = EXT3_I(dir)->i_flags & EXT3_FL_INHERITED; if (S_ISLNK(mode)) ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); /* dirsync only applies to directories */ diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index d14f02918483..b745619a9b8e 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -178,6 +178,13 @@ struct ext3_group_desc #define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\ + EXT3_SYNC_FL | EXT3_IMMUTABLE_FL | EXT3_APPEND_FL |\ + EXT3_NODUMP_FL | EXT3_NOATIME_FL | EXT3_COMPRBLK_FL|\ + EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\ + EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL) + /* * Inode dynamic state flags */ -- cgit v1.2.3 From 04143e2fb9d512c21e1dcfb561dbb0445dcfdc8c Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Wed, 7 Jan 2009 18:07:26 -0800 Subject: ext3: tighten restrictions on inode flags At the moment there are few restrictions on which flags may be set on which inodes. Specifically DIRSYNC may only be set on directories and IMMUTABLE and APPEND may not be set on links. Tighten that to disallow TOPDIR being set on non-directories and only NODUMP and NOATIME to be set on non-regular file, non-directories. Introduces a flags masking function which masks flags based on mode and use it during inode creation and when flags are set via the ioctl to facilitate future consistency. Signed-off-by: Duane Griffin Acked-by: Andreas Dilger Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/ialloc.c | 8 ++------ fs/ext3/ioctl.c | 3 +-- include/linux/ext3_fs.h | 17 +++++++++++++++++ 3 files changed, 20 insertions(+), 8 deletions(-) (limited to 'fs/ext3') diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index ba9186a21d1e..8de6c720e510 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -559,12 +559,8 @@ got: ei->i_dir_start_lookup = 0; ei->i_disksize = 0; - ei->i_flags = EXT3_I(dir)->i_flags & EXT3_FL_INHERITED; - if (S_ISLNK(mode)) - ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); - /* dirsync only applies to directories */ - if (!S_ISDIR(mode)) - ei->i_flags &= ~EXT3_DIRSYNC_FL; + ei->i_flags = + ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED); #ifdef EXT3_FRAGMENTS ei->i_faddr = 0; ei->i_frag_no = 0; diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index b7394d05ee8e..5e86ce9a86e0 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -53,8 +53,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, goto flags_out; } - if (!S_ISDIR(inode->i_mode)) - flags &= ~EXT3_DIRSYNC_FL; + flags = ext3_mask_flags(inode->i_mode, flags); mutex_lock(&inode->i_mutex); /* Is it quota file? Do not allow user to mess with it */ diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index b745619a9b8e..d76800f6ecf0 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -185,6 +185,23 @@ struct ext3_group_desc EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\ EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL) +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define EXT3_REG_FLMASK (~(EXT3_DIRSYNC_FL | EXT3_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT3_OTHER_FLMASK (EXT3_NODUMP_FL | EXT3_NOATIME_FL) + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT3_REG_FLMASK; + else + return flags & EXT3_OTHER_FLMASK; +} + /* * Inode dynamic state flags */ -- cgit v1.2.3 From be857df1dd8d8e1491e60d999caf3b8446ccd475 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 7 Jan 2009 18:09:12 -0800 Subject: generic swap(): ext3: remove local swap() macro Use the new generic implementation. Signed-off-by: Wu Fengguang Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/namei.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/ext3') diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 1dd2abe6313e..8d6f965e502c 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -74,10 +74,6 @@ static struct buffer_head *ext3_append(handle_t *handle, #define assert(test) J_ASSERT(test) #endif -#ifndef swap -#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) -#endif - #ifdef DX_DEBUG #define dxtrace(command) command #else -- cgit v1.2.3 From c4be0c1dc4cdc37b175579be1460f15ac6495e9a Mon Sep 17 00:00:00 2001 From: Takashi Sato Date: Fri, 9 Jan 2009 16:40:58 -0800 Subject: filesystem freeze: add error handling of write_super_lockfs/unlockfs Currently, ext3 in mainline Linux doesn't have the freeze feature which suspends write requests. So, we cannot take a backup which keeps the filesystem's consistency with the storage device's features (snapshot and replication) while it is mounted. In many case, a commercial filesystem (e.g. VxFS) has the freeze feature and it would be used to get the consistent backup. If Linux's standard filesystem ext3 has the freeze feature, we can do it without a commercial filesystem. So I have implemented the ioctls of the freeze feature. I think we can take the consistent backup with the following steps. 1. Freeze the filesystem with the freeze ioctl. 2. Separate the replication volume or create the snapshot with the storage device's feature. 3. Unfreeze the filesystem with the unfreeze ioctl. 4. Take the backup from the separated replication volume or the snapshot. This patch: VFS: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that they can return an error. Rename write_super_lockfs and unlockfs of the super block operation freeze_fs and unfreeze_fs to avoid a confusion. ext3, ext4, xfs, gfs2, jfs: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that write_super_lockfs returns an error if needed, and unlockfs always returns 0. reiserfs: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that they always return 0 (success) to keep a current behavior. Signed-off-by: Takashi Sato Signed-off-by: Masayuki Hamaguchi Cc: Cc: Cc: Christoph Hellwig Cc: Dave Kleikamp Cc: Dave Chinner Cc: Alasdair G Kergon Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 8 +++---- Documentation/filesystems/vfs.txt | 8 +++---- fs/buffer.c | 8 +++---- fs/ext3/super.c | 45 +++++++++++++++++++++++++-------------- fs/ext4/super.c | 45 +++++++++++++++++++++++++++------------ fs/gfs2/ops_super.c | 16 ++++++++------ fs/jfs/super.c | 10 +++++---- fs/reiserfs/super.c | 10 +++++---- fs/xfs/linux-2.6/xfs_super.c | 8 +++---- fs/xfs/xfs_fsops.c | 11 ++++++---- fs/xfs/xfs_fsops.h | 2 +- include/linux/fs.h | 4 ++-- 12 files changed, 107 insertions(+), 68 deletions(-) (limited to 'fs/ext3') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index cfbfa15a46ba..ec6a9392a173 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -97,8 +97,8 @@ prototypes: void (*put_super) (struct super_block *); void (*write_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); - void (*write_super_lockfs) (struct super_block *); - void (*unlockfs) (struct super_block *); + int (*freeze_fs) (struct super_block *); + int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); @@ -119,8 +119,8 @@ delete_inode: no put_super: yes yes no write_super: no yes read sync_fs: no no read -write_super_lockfs: ? -unlockfs: ? +freeze_fs: ? +unfreeze_fs: ? statfs: no no no remount_fs: yes yes maybe (see below) clear_inode: no diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index ef19afa186a9..deeeed0faa8f 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -210,8 +210,8 @@ struct super_operations { void (*put_super) (struct super_block *); void (*write_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); - void (*write_super_lockfs) (struct super_block *); - void (*unlockfs) (struct super_block *); + int (*freeze_fs) (struct super_block *); + int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); @@ -270,11 +270,11 @@ or bottom half). a superblock. The second parameter indicates whether the method should wait until the write out has been completed. Optional. - write_super_lockfs: called when VFS is locking a filesystem and + freeze_fs: called when VFS is locking a filesystem and forcing it into a consistent state. This method is currently used by the Logical Volume Manager (LVM). - unlockfs: called when VFS is unlocking a filesystem and making it writable + unfreeze_fs: called when VFS is unlocking a filesystem and making it writable again. statfs: called when the VFS needs to get filesystem statistics. This diff --git a/fs/buffer.c b/fs/buffer.c index c26da785938a..87f9e537b8c3 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -221,8 +221,8 @@ struct super_block *freeze_bdev(struct block_device *bdev) sync_blockdev(sb->s_bdev); - if (sb->s_op->write_super_lockfs) - sb->s_op->write_super_lockfs(sb); + if (sb->s_op->freeze_fs) + sb->s_op->freeze_fs(sb); } sync_blockdev(bdev); @@ -242,8 +242,8 @@ void thaw_bdev(struct block_device *bdev, struct super_block *sb) if (sb) { BUG_ON(sb->s_bdev != bdev); - if (sb->s_op->unlockfs) - sb->s_op->unlockfs(sb); + if (sb->s_op->unfreeze_fs) + sb->s_op->unfreeze_fs(sb); sb->s_frozen = SB_UNFROZEN; smp_wmb(); wake_up(&sb->s_wait_unfrozen); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 5d047a030a73..b70d90e08a3c 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -48,8 +48,8 @@ static int ext3_load_journal(struct super_block *, struct ext3_super_block *, unsigned long journal_devnum); static int ext3_create_journal(struct super_block *, struct ext3_super_block *, unsigned int); -static void ext3_commit_super (struct super_block * sb, - struct ext3_super_block * es, +static int ext3_commit_super(struct super_block *sb, + struct ext3_super_block *es, int sync); static void ext3_mark_recovery_complete(struct super_block * sb, struct ext3_super_block * es); @@ -60,9 +60,9 @@ static const char *ext3_decode_error(struct super_block * sb, int errno, char nbuf[16]); static int ext3_remount (struct super_block * sb, int * flags, char * data); static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); -static void ext3_unlockfs(struct super_block *sb); +static int ext3_unfreeze(struct super_block *sb); static void ext3_write_super (struct super_block * sb); -static void ext3_write_super_lockfs(struct super_block *sb); +static int ext3_freeze(struct super_block *sb); /* * Wrappers for journal_start/end. @@ -759,8 +759,8 @@ static const struct super_operations ext3_sops = { .put_super = ext3_put_super, .write_super = ext3_write_super, .sync_fs = ext3_sync_fs, - .write_super_lockfs = ext3_write_super_lockfs, - .unlockfs = ext3_unlockfs, + .freeze_fs = ext3_freeze, + .unfreeze_fs = ext3_unfreeze, .statfs = ext3_statfs, .remount_fs = ext3_remount, .clear_inode = ext3_clear_inode, @@ -2311,21 +2311,23 @@ static int ext3_create_journal(struct super_block * sb, return 0; } -static void ext3_commit_super (struct super_block * sb, - struct ext3_super_block * es, +static int ext3_commit_super(struct super_block *sb, + struct ext3_super_block *es, int sync) { struct buffer_head *sbh = EXT3_SB(sb)->s_sbh; + int error = 0; if (!sbh) - return; + return error; es->s_wtime = cpu_to_le32(get_seconds()); es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); if (sync) - sync_dirty_buffer(sbh); + error = sync_dirty_buffer(sbh); + return error; } @@ -2439,12 +2441,14 @@ static int ext3_sync_fs(struct super_block *sb, int wait) * LVM calls this function before a (read-only) snapshot is created. This * gives us a chance to flush the journal completely and mark the fs clean. */ -static void ext3_write_super_lockfs(struct super_block *sb) +static int ext3_freeze(struct super_block *sb) { + int error = 0; + journal_t *journal; sb->s_dirt = 0; if (!(sb->s_flags & MS_RDONLY)) { - journal_t *journal = EXT3_SB(sb)->s_journal; + journal = EXT3_SB(sb)->s_journal; /* Now we set up the journal barrier. */ journal_lock_updates(journal); @@ -2453,20 +2457,28 @@ static void ext3_write_super_lockfs(struct super_block *sb) * We don't want to clear needs_recovery flag when we failed * to flush the journal. */ - if (journal_flush(journal) < 0) - return; + error = journal_flush(journal); + if (error < 0) + goto out; /* Journal blocked and flushed, clear needs_recovery flag. */ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); - ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); + error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); + if (error) + goto out; } + return 0; + +out: + journal_unlock_updates(journal); + return error; } /* * Called by LVM after the snapshot is done. We need to reset the RECOVER * flag here, even though the filesystem is not technically dirty yet. */ -static void ext3_unlockfs(struct super_block *sb) +static int ext3_unfreeze(struct super_block *sb) { if (!(sb->s_flags & MS_RDONLY)) { lock_super(sb); @@ -2476,6 +2488,7 @@ static void ext3_unlockfs(struct super_block *sb) unlock_super(sb); journal_unlock_updates(EXT3_SB(sb)->s_journal); } + return 0; } static int ext3_remount (struct super_block * sb, int * flags, char * data) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8f7e0be8ab1b..e5f06a5f045e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -51,7 +51,7 @@ struct proc_dir_entry *ext4_proc_root; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); -static void ext4_commit_super(struct super_block *sb, +static int ext4_commit_super(struct super_block *sb, struct ext4_super_block *es, int sync); static void ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); @@ -62,9 +62,9 @@ static const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]); static int ext4_remount(struct super_block *sb, int *flags, char *data); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); -static void ext4_unlockfs(struct super_block *sb); +static int ext4_unfreeze(struct super_block *sb); static void ext4_write_super(struct super_block *sb); -static void ext4_write_super_lockfs(struct super_block *sb); +static int ext4_freeze(struct super_block *sb); ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, @@ -978,8 +978,8 @@ static const struct super_operations ext4_sops = { .put_super = ext4_put_super, .write_super = ext4_write_super, .sync_fs = ext4_sync_fs, - .write_super_lockfs = ext4_write_super_lockfs, - .unlockfs = ext4_unlockfs, + .freeze_fs = ext4_freeze, + .unfreeze_fs = ext4_unfreeze, .statfs = ext4_statfs, .remount_fs = ext4_remount, .clear_inode = ext4_clear_inode, @@ -2888,13 +2888,14 @@ static int ext4_load_journal(struct super_block *sb, return 0; } -static void ext4_commit_super(struct super_block *sb, +static int ext4_commit_super(struct super_block *sb, struct ext4_super_block *es, int sync) { struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; + int error = 0; if (!sbh) - return; + return error; if (buffer_write_io_error(sbh)) { /* * Oh, dear. A previous attempt to write the @@ -2918,14 +2919,19 @@ static void ext4_commit_super(struct super_block *sb, BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); if (sync) { - sync_dirty_buffer(sbh); - if (buffer_write_io_error(sbh)) { + error = sync_dirty_buffer(sbh); + if (error) + return error; + + error = buffer_write_io_error(sbh); + if (error) { printk(KERN_ERR "EXT4-fs: I/O error while writing " "superblock for %s.\n", sb->s_id); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } } + return error; } @@ -3058,12 +3064,14 @@ static int ext4_sync_fs(struct super_block *sb, int wait) * LVM calls this function before a (read-only) snapshot is created. This * gives us a chance to flush the journal completely and mark the fs clean. */ -static void ext4_write_super_lockfs(struct super_block *sb) +static int ext4_freeze(struct super_block *sb) { + int error = 0; + journal_t *journal; sb->s_dirt = 0; if (!(sb->s_flags & MS_RDONLY)) { - journal_t *journal = EXT4_SB(sb)->s_journal; + journal = EXT4_SB(sb)->s_journal; if (journal) { /* Now we set up the journal barrier. */ @@ -3073,21 +3081,29 @@ static void ext4_write_super_lockfs(struct super_block *sb) * We don't want to clear needs_recovery flag when we * failed to flush the journal. */ - if (jbd2_journal_flush(journal) < 0) - return; + error = jbd2_journal_flush(journal); + if (error < 0) + goto out; } /* Journal blocked and flushed, clear needs_recovery flag. */ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); + error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); + if (error) + goto out; } + return 0; +out: + jbd2_journal_unlock_updates(journal); + return error; } /* * Called by LVM after the snapshot is done. We need to reset the RECOVER * flag here, even though the filesystem is not technically dirty yet. */ -static void ext4_unlockfs(struct super_block *sb) +static int ext4_unfreeze(struct super_block *sb) { if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) { lock_super(sb); @@ -3097,6 +3113,7 @@ static void ext4_unlockfs(struct super_block *sb) unlock_super(sb); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } + return 0; } static int ext4_remount(struct super_block *sb, int *flags, char *data) diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 777783deddcb..320323d03479 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -211,18 +211,18 @@ static int gfs2_sync_fs(struct super_block *sb, int wait) } /** - * gfs2_write_super_lockfs - prevent further writes to the filesystem + * gfs2_freeze - prevent further writes to the filesystem * @sb: the VFS structure for the filesystem * */ -static void gfs2_write_super_lockfs(struct super_block *sb) +static int gfs2_freeze(struct super_block *sb) { struct gfs2_sbd *sdp = sb->s_fs_info; int error; if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - return; + return -EINVAL; for (;;) { error = gfs2_freeze_fs(sdp); @@ -242,17 +242,19 @@ static void gfs2_write_super_lockfs(struct super_block *sb) fs_err(sdp, "retrying...\n"); msleep(1000); } + return 0; } /** - * gfs2_unlockfs - reallow writes to the filesystem + * gfs2_unfreeze - reallow writes to the filesystem * @sb: the VFS structure for the filesystem * */ -static void gfs2_unlockfs(struct super_block *sb) +static int gfs2_unfreeze(struct super_block *sb) { gfs2_unfreeze_fs(sb->s_fs_info); + return 0; } /** @@ -688,8 +690,8 @@ const struct super_operations gfs2_super_ops = { .put_super = gfs2_put_super, .write_super = gfs2_write_super, .sync_fs = gfs2_sync_fs, - .write_super_lockfs = gfs2_write_super_lockfs, - .unlockfs = gfs2_unlockfs, + .freeze_fs = gfs2_freeze, + .unfreeze_fs = gfs2_unfreeze, .statfs = gfs2_statfs, .remount_fs = gfs2_remount_fs, .clear_inode = gfs2_clear_inode, diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 0dae345e481b..b37d1f78b854 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -543,7 +543,7 @@ out_kfree: return ret; } -static void jfs_write_super_lockfs(struct super_block *sb) +static int jfs_freeze(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; @@ -553,9 +553,10 @@ static void jfs_write_super_lockfs(struct super_block *sb) lmLogShutdown(log); updateSuper(sb, FM_CLEAN); } + return 0; } -static void jfs_unlockfs(struct super_block *sb) +static int jfs_unfreeze(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; @@ -568,6 +569,7 @@ static void jfs_unlockfs(struct super_block *sb) else txResume(sb); } + return 0; } static int jfs_get_sb(struct file_system_type *fs_type, @@ -735,8 +737,8 @@ static const struct super_operations jfs_super_operations = { .delete_inode = jfs_delete_inode, .put_super = jfs_put_super, .sync_fs = jfs_sync_fs, - .write_super_lockfs = jfs_write_super_lockfs, - .unlockfs = jfs_unlockfs, + .freeze_fs = jfs_freeze, + .unfreeze_fs = jfs_unfreeze, .statfs = jfs_statfs, .remount_fs = jfs_remount, .show_options = jfs_show_options, diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index c55651f1407c..f3c820b75829 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -83,7 +83,7 @@ static void reiserfs_write_super(struct super_block *s) reiserfs_sync_fs(s, 1); } -static void reiserfs_write_super_lockfs(struct super_block *s) +static int reiserfs_freeze(struct super_block *s) { struct reiserfs_transaction_handle th; reiserfs_write_lock(s); @@ -101,11 +101,13 @@ static void reiserfs_write_super_lockfs(struct super_block *s) } s->s_dirt = 0; reiserfs_write_unlock(s); + return 0; } -static void reiserfs_unlockfs(struct super_block *s) +static int reiserfs_unfreeze(struct super_block *s) { reiserfs_allow_writes(s); + return 0; } extern const struct in_core_key MAX_IN_CORE_KEY; @@ -613,8 +615,8 @@ static const struct super_operations reiserfs_sops = { .put_super = reiserfs_put_super, .write_super = reiserfs_write_super, .sync_fs = reiserfs_sync_fs, - .write_super_lockfs = reiserfs_write_super_lockfs, - .unlockfs = reiserfs_unlockfs, + .freeze_fs = reiserfs_freeze, + .unfreeze_fs = reiserfs_unfreeze, .statfs = reiserfs_statfs, .remount_fs = reiserfs_remount, .show_options = generic_show_options, diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index be846d606ae8..95a971080368 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1269,14 +1269,14 @@ xfs_fs_remount( * need to take care of the metadata. Once that's done write a dummy * record to dirty the log in case of a crash while frozen. */ -STATIC void -xfs_fs_lockfs( +STATIC int +xfs_fs_freeze( struct super_block *sb) { struct xfs_mount *mp = XFS_M(sb); xfs_quiesce_attr(mp); - xfs_fs_log_dummy(mp); + return -xfs_fs_log_dummy(mp); } STATIC int @@ -1557,7 +1557,7 @@ static struct super_operations xfs_super_operations = { .put_super = xfs_fs_put_super, .write_super = xfs_fs_write_super, .sync_fs = xfs_fs_sync_super, - .write_super_lockfs = xfs_fs_lockfs, + .freeze_fs = xfs_fs_freeze, .statfs = xfs_fs_statfs, .remount_fs = xfs_fs_remount, .show_options = xfs_fs_show_options, diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 852b6d32e8d0..680d0e0ec932 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -595,17 +595,19 @@ out: return 0; } -void +int xfs_fs_log_dummy( xfs_mount_t *mp) { xfs_trans_t *tp; xfs_inode_t *ip; + int error; tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); - if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) { + error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); + if (error) { xfs_trans_cancel(tp, 0); - return; + return error; } ip = mp->m_rootip; @@ -615,9 +617,10 @@ xfs_fs_log_dummy( xfs_trans_ihold(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_set_sync(tp); - xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; } int diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 300d0c9d61ad..88435e0a77c9 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); -extern void xfs_fs_log_dummy(xfs_mount_t *mp); +extern int xfs_fs_log_dummy(xfs_mount_t *mp); #endif /* __XFS_FSOPS_H__ */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 0b87b29f4797..3e59182de9df 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1377,8 +1377,8 @@ struct super_operations { void (*put_super) (struct super_block *); void (*write_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); - void (*write_super_lockfs) (struct super_block *); - void (*unlockfs) (struct super_block *); + int (*freeze_fs) (struct super_block *); + int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); -- cgit v1.2.3